CLAHE Python bindings
[profile/ivi/opencv.git] / modules / ocl / src / initialization.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // @Authors
19 //    Guoping Long, longguoping@gmail.com
20 //    Niko Li, newlife20080214@gmail.com
21 //    Yao Wang, bitwangyaoyao@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
24 //
25 //   * Redistribution's of source code must retain the above copyright notice,
26 //     this list of conditions and the following disclaimer.
27 //
28 //   * Redistribution's in binary form must reproduce the above copyright notice,
29 //     this list of conditions and the following disclaimer in the documentation
30 //     and/or other oclMaterials provided with the distribution.
31 //
32 //   * The name of the copyright holders may not be used to endorse or promote products
33 //     derived from this software without specific prior written permission.
34 //
35 // This software is provided by the copyright holders and contributors "as is" and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
45 //
46 //M*/
47
48 #include "precomp.hpp"
49 #include <iomanip>
50 #include <fstream>
51 #include "binarycaching.hpp"
52
53 using namespace cv;
54 using namespace cv::ocl;
55 using namespace std;
56 using std::cout;
57 using std::endl;
58
59 //#define PRINT_KERNEL_RUN_TIME
60 #define RUN_TIMES 100
61 #ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
62 #define CL_MEM_USE_PERSISTENT_MEM_AMD 0
63 #endif
64 //#define AMD_DOUBLE_DIFFER
65
66 namespace cv
67 {
68     namespace ocl
69     {
70         extern void fft_teardown();
71         /*
72          * The binary caching system to eliminate redundant program source compilation.
73          * Strictly, this is not a cache because we do not implement evictions right now.
74          * We shall add such features to trade-off memory consumption and performance when necessary.
75          */
76         auto_ptr<ProgramCache> ProgramCache::programCache;
77         ProgramCache *programCache = NULL;
78         DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
79         DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
80         int gDevMemTypeValueMap[5] = {0,
81                                       CL_MEM_ALLOC_HOST_PTR,
82                                       CL_MEM_USE_HOST_PTR,
83                                       CL_MEM_COPY_HOST_PTR,
84                                       CL_MEM_USE_PERSISTENT_MEM_AMD};
85         int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
86
87         ProgramCache::ProgramCache()
88         {
89             codeCache.clear();
90             cacheSize = 0;
91         }
92
93         ProgramCache::~ProgramCache()
94         {
95             releaseProgram();
96         }
97
98         cl_program ProgramCache::progLookup(string srcsign)
99         {
100             map<string, cl_program>::iterator iter;
101             iter = codeCache.find(srcsign);
102             if(iter != codeCache.end())
103                 return iter->second;
104             else
105                 return NULL;
106         }
107
108         void ProgramCache::addProgram(string srcsign , cl_program program)
109         {
110             if(!progLookup(srcsign))
111             {
112                 codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
113             }
114         }
115
116         void ProgramCache::releaseProgram()
117         {
118             map<string, cl_program>::iterator iter;
119             for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
120             {
121                 openCLSafeCall(clReleaseProgram(iter->second));
122             }
123             codeCache.clear();
124             cacheSize = 0;
125         }
126
127         // not to be exported to dynamic lib
128         void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl);
129         struct Info::Impl
130         {
131             cl_platform_id oclplatform;
132             std::vector<cl_device_id> devices;
133             std::vector<std::string> devName;
134             std::string clVersion;
135
136             cl_context oclcontext;
137             cl_command_queue clCmdQueue;
138             int devnum;
139             size_t maxWorkGroupSize;
140             cl_uint maxDimensions; // == maxWorkItemSizes.size()
141             std::vector<size_t> maxWorkItemSizes;
142             cl_uint maxComputeUnits;
143             char extra_options[512];
144             int  double_support;
145             int unified_memory; //1 means integrated GPU, otherwise this value is 0
146             bool enable_disk_cache; 
147             bool update_disk_cache;
148             string binpath;
149             int refcounter;
150
151             Impl();
152
153             void setDevice(void *ctx, void *q, int devnum);
154
155             void release()
156             {
157                 if(1 == CV_XADD(&refcounter, -1))
158                 {
159                     releaseResources();
160                     delete this;
161                 }
162             }
163
164             Impl* copy()
165             {
166                 CV_XADD(&refcounter, 1);
167                 return this;
168             }
169
170         private:
171             Impl(const Impl&);
172             Impl& operator=(const Impl&);
173             void releaseResources();
174         };
175
176         Info::Impl::Impl()
177             :oclplatform(0),
178             oclcontext(0),
179             clCmdQueue(0),
180             devnum(-1),
181             maxWorkGroupSize(0),
182             maxDimensions(0),
183             maxComputeUnits(0),
184             double_support(0),
185             unified_memory(0),
186             enable_disk_cache(false),
187             update_disk_cache(false),
188             binpath("./"),
189             refcounter(1)
190         {
191             memset(extra_options, 0, 512);
192             setBinaryDiskCacheImpl(CACHE_RELEASE, String("./"), this);
193         }
194
195         void Info::Impl::releaseResources()
196         {
197             devnum = -1;
198
199             if(clCmdQueue)
200             {
201                 openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
202                 clCmdQueue = 0;
203             }
204
205             if(oclcontext)
206             {
207                 openCLSafeCall(clReleaseContext(oclcontext));
208                 oclcontext = 0;
209             }
210         }
211
212         void Info::Impl::setDevice(void *ctx, void *q, int dnum)
213         {
214             if((ctx && q) || devnum != dnum)
215                 releaseResources();
216
217             CV_Assert(dnum >= 0 && dnum < (int)devices.size());
218             devnum = dnum;
219             if(ctx && q)
220             {
221                 oclcontext = (cl_context)ctx;
222                 clCmdQueue = (cl_command_queue)q;
223                 clRetainContext(oclcontext);
224                 clRetainCommandQueue(clCmdQueue);
225             }
226             else
227             {
228                 cl_int status = 0;
229                 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
230                 oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
231                 openCLVerifyCall(status);
232                 clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
233                 openCLVerifyCall(status);
234             }
235
236             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
237             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
238             maxWorkItemSizes.resize(maxDimensions);
239             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
240             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
241
242             cl_bool unfymem = false;
243             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
244             unified_memory = unfymem ? 1 : 0;
245
246             //initialize extra options for compilation. Currently only fp64 is included.
247             //Assume 4KB is enough to store all possible extensions.
248             const int EXT_LEN = 4096 + 1 ;
249             char extends_set[EXT_LEN];
250             size_t extends_size;
251             openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
252             extends_set[EXT_LEN - 1] = 0;
253             size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
254
255             if(fp64_khr != std::string::npos)
256             {
257                 sprintf(extra_options, "-D DOUBLE_SUPPORT");
258                 double_support = 1;
259             }
260             else
261             {
262                 memset(extra_options, 0, 512);
263                 double_support = 0;
264             }
265         }
266
267         ////////////////////////Common OpenCL specific calls///////////////
268         int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
269         {
270             rw_type = gDeviceMemRW;
271             mem_type = gDeviceMemType;
272             return Context::getContext()->impl->unified_memory;
273         }
274
275         int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
276         {
277             if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
278                  mem_type == DEVICE_MEM_UHP ||
279                  mem_type == DEVICE_MEM_CHP )
280                 return -1;
281             gDeviceMemRW = rw_type;
282             gDeviceMemType = mem_type;
283             return 0;
284         }
285
286         inline int divUp(int total, int grain)
287         {
288             return (total + grain - 1) / grain;
289         }
290
291         int getDevice(std::vector<Info> &oclinfo, int devicetype)
292         {
293             //TODO: cache oclinfo vector
294             oclinfo.clear();
295
296             switch(devicetype)
297             {
298             case CVCL_DEVICE_TYPE_DEFAULT:
299             case CVCL_DEVICE_TYPE_CPU:
300             case CVCL_DEVICE_TYPE_GPU:
301             case CVCL_DEVICE_TYPE_ACCELERATOR:
302             case CVCL_DEVICE_TYPE_ALL:
303                 break;
304             default:
305                 return 0;
306             }
307
308             // Platform info
309             cl_uint numPlatforms;
310             openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
311             if(numPlatforms < 1) return 0;
312
313             std::vector<cl_platform_id> platforms(numPlatforms);
314             openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
315
316             char deviceName[256];
317             int devcienums = 0;
318             char clVersion[256];
319             for (unsigned i = 0; i < numPlatforms; ++i)
320             {
321                 cl_uint numsdev;
322                 cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
323                 if(status != CL_DEVICE_NOT_FOUND)
324                     openCLVerifyCall(status);
325
326                 if(numsdev > 0)
327                 {
328                     devcienums += numsdev;
329                     std::vector<cl_device_id> devices(numsdev);
330                     openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
331
332                     Info ocltmpinfo;
333                     ocltmpinfo.impl->oclplatform = platforms[i];
334                     openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
335                     ocltmpinfo.impl->clVersion = clVersion;
336                     for(unsigned j = 0; j < numsdev; ++j)
337                     {
338                         ocltmpinfo.impl->devices.push_back(devices[j]);
339                         openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
340                         ocltmpinfo.impl->devName.push_back(deviceName);
341                         ocltmpinfo.DeviceName.push_back(deviceName);
342                     }
343                     oclinfo.push_back(ocltmpinfo);
344                 }
345             }
346             if(devcienums > 0)
347             {
348                 setDevice(oclinfo[0]);
349             }
350             return devcienums;
351         }
352
353         void setDevice(Info &oclinfo, int devnum)
354         {
355             oclinfo.impl->setDevice(0, 0, devnum);
356             Context::setContext(oclinfo);
357         }
358
359         void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
360         {
361             oclinfo.impl->setDevice(ctx, q, devnum);
362             Context::setContext(oclinfo);
363          }
364
365         void *getoclContext()
366         {
367             return &(Context::getContext()->impl->oclcontext);
368         }
369
370         void *getoclCommandQueue()
371         {
372             return &(Context::getContext()->impl->clCmdQueue);
373         }
374
375         void finish()
376         {
377             clFinish(Context::getContext()->impl->clCmdQueue);
378         }
379
380         //template specializations of queryDeviceInfo
381         template<>
382         bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
383         {
384             Info::Impl* impl = Context::getContext()->impl;
385             cl_device_type devicetype;
386             openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
387                 CL_DEVICE_TYPE, sizeof(cl_device_type),
388                 &devicetype, NULL));
389             return (devicetype == CVCL_DEVICE_TYPE_CPU);
390         }
391
392         template<typename _ty>
393         static _ty queryWavesize(cl_kernel kernel)
394         {
395             size_t info = 0;
396             Info::Impl* impl = Context::getContext()->impl;
397             bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
398             if(is_cpu)
399             {
400                 return 1;
401             }
402             CV_Assert(kernel != NULL);
403             openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
404                 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
405             return static_cast<_ty>(info);
406         }
407
408         template<>
409         size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
410         {
411             return queryWavesize<size_t>(kernel);
412         }
413         template<>
414         int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
415         {
416             return queryWavesize<int>(kernel);
417         }
418
419         void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
420         {
421             cl_int status;
422             status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
423                                          size, host_buffer, 0, NULL, NULL);
424             openCLVerifyCall(status);
425         }
426
427         cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
428         {
429             cl_int status;
430             cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
431             openCLVerifyCall(status);
432             return buffer;
433         }
434
435         void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
436                                size_t widthInBytes, size_t height)
437         {
438             openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
439         }
440
441         void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
442                                size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
443         {
444             cl_int status;
445             *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
446                                       widthInBytes * height, 0, &status);
447             openCLVerifyCall(status);
448             *pitch = widthInBytes;
449         }
450
451         void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
452                             const void *src, size_t spitch,
453                             size_t width, size_t height, openCLMemcpyKind kind, int channels)
454         {
455             size_t buffer_origin[3] = {0, 0, 0};
456             size_t host_origin[3] = {0, 0, 0};
457             size_t region[3] = {width, height, 1};
458             if(kind == clMemcpyHostToDevice)
459             {
460                 if(dpitch == width || channels == 3 || height == 1)
461                 {
462                     openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
463                                                         0, width * height, src, 0, NULL, NULL));
464                 }
465                 else
466                 {
467                     openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
468                                                             buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
469                 }
470             }
471             else if(kind == clMemcpyDeviceToHost)
472             {
473                 if(spitch == width || channels == 3 || height == 1)
474                 {
475                     openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
476                                                        0, width * height, dst, 0, NULL, NULL));
477                 }
478                 else
479                 {
480                     openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
481                                                            buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
482                 }
483             }
484         }
485
486         void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
487                                 const void *src, size_t spitch,
488                                 size_t width, size_t height, int src_offset)
489         {
490             size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
491             size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
492             size_t region[3] = {width, height, 1};
493
494             openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
495                                                    region, spitch, 0, dpitch, 0, 0, 0, 0));
496         }
497
498         void openCLFree(void *devPtr)
499         {
500             openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
501         }
502         cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
503         {
504             return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
505         }
506
507         void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl)
508         {
509             impl->update_disk_cache = (mode & CACHE_UPDATE) == CACHE_UPDATE;
510             impl->enable_disk_cache = 
511 #ifdef _DEBUG 
512                 (mode & CACHE_DEBUG)   == CACHE_DEBUG;
513 #else
514                 (mode & CACHE_RELEASE) == CACHE_RELEASE;
515 #endif
516             if(impl->enable_disk_cache && !path.empty())
517             {
518                 impl->binpath = path;
519             }
520         }
521         void setBinaryDiskCache(int mode, cv::String path)
522         {
523             setBinaryDiskCacheImpl(mode, path, Context::getContext()->impl);
524         }
525
526         void setBinpath(const char *path)
527         {
528             Context *clcxt = Context::getContext();
529             clcxt->impl->binpath = path;
530         }
531
532         int savetofile(const Context*,  cl_program &program, const char *fileName)
533         {
534             size_t binarySize;
535             openCLSafeCall(clGetProgramInfo(program,
536                                     CL_PROGRAM_BINARY_SIZES,
537                                     sizeof(size_t),
538                                     &binarySize, NULL));
539             char* binary = (char*)malloc(binarySize);
540             if(binary == NULL)
541             {
542                 CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
543             }
544             openCLSafeCall(clGetProgramInfo(program,
545                                     CL_PROGRAM_BINARIES,
546                                     sizeof(char *),
547                                     &binary,
548                                     NULL));
549
550             FILE *fp = fopen(fileName, "wb+");
551             if(fp != NULL)
552             {
553                 fwrite(binary, binarySize, 1, fp);
554                 free(binary);
555                 fclose(fp);
556             }
557             return 1;
558         }
559
560         cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
561                                             const char *build_options)
562         {
563             cl_kernel kernel;
564             cl_program program ;
565             cl_int status = 0;
566             stringstream src_sign;
567             string srcsign;
568             string filename;
569             CV_Assert(programCache != NULL);
570
571             if(NULL != build_options)
572             {
573                 src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
574             }
575             else
576             {
577                 src_sign << (int64)(*source) << clCxt->impl->oclcontext;
578             }
579             srcsign = src_sign.str();
580
581             program = NULL;
582             program = programCache->progLookup(srcsign);
583
584             if(!program)
585             {
586                 //config build programs
587                 char all_build_options[1024];
588                 memset(all_build_options, 0, 1024);
589                 char zeromem[512] = {0};
590                 if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512))
591                     strcat(all_build_options, clCxt -> impl->extra_options);
592                 strcat(all_build_options, " ");
593                 if(build_options != NULL)
594                     strcat(all_build_options, build_options);
595                 if(all_build_options != NULL)
596                 {
597                     filename = clCxt->impl->binpath  + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
598                 }
599                 else
600                 {
601                     filename = clCxt->impl->binpath  + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
602                 }
603
604                 FILE *fp = clCxt->impl->enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
605                 if(fp == NULL || clCxt->impl->update_disk_cache)
606                 {
607                     if(fp != NULL)
608                         fclose(fp);
609
610                     program = clCreateProgramWithSource(
611                                   clCxt->impl->oclcontext, 1, source, NULL, &status);
612                     openCLVerifyCall(status);
613                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
614                     if(status == CL_SUCCESS && clCxt->impl->enable_disk_cache)
615                         savetofile(clCxt, program, filename.c_str());
616                 }
617                 else
618                 {
619                     fseek(fp, 0, SEEK_END);
620                     size_t binarySize = ftell(fp);
621                     fseek(fp, 0, SEEK_SET);
622                     char *binary = new char[binarySize];
623                     CV_Assert(1 == fread(binary, binarySize, 1, fp));
624                     fclose(fp);
625                     cl_int status = 0;
626                     program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
627                                                         1,
628                                                         &(clCxt->impl->devices[clCxt->impl->devnum]),
629                                                         (const size_t *)&binarySize,
630                                                         (const unsigned char **)&binary,
631                                                         NULL,
632                                                         &status);
633                     openCLVerifyCall(status);
634                     status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
635                     delete[] binary;
636                 }
637
638                 if(status != CL_SUCCESS)
639                 {
640                     if(status == CL_BUILD_PROGRAM_FAILURE)
641                     {
642                         cl_int logStatus;
643                         char *buildLog = NULL;
644                         size_t buildLogSize = 0;
645                         logStatus = clGetProgramBuildInfo(program,
646                                                           clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
647                                                           buildLog, &buildLogSize);
648                         if(logStatus != CL_SUCCESS)
649                             cout << "Failed to build the program and get the build info." << endl;
650                         buildLog = new char[buildLogSize];
651                         CV_DbgAssert(!!buildLog);
652                         memset(buildLog, 0, buildLogSize);
653                         openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
654                                                              CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
655                         cout << "\n\t\t\tBUILD LOG\n";
656                         cout << buildLog << endl;
657                         delete [] buildLog;
658                     }
659                     openCLVerifyCall(status);
660                 }
661                 //Cache the binary for future use if build_options is null
662                 if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
663                     programCache->addProgram(srcsign, program);
664                 else
665                     cout << "Warning: code cache has been full.\n";
666             }
667             kernel = clCreateKernel(program, kernelName.c_str(), &status);
668             openCLVerifyCall(status);
669             return kernel;
670         }
671
672         void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
673         {
674             size_t kernelWorkGroupSize;
675             openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
676                                                     CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
677             CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
678             CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
679             CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
680             CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
681             CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
682         }
683
684 #ifdef PRINT_KERNEL_RUN_TIME
685         static double total_execute_time = 0;
686         static double total_kernel_time = 0;
687 #endif
688         void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
689                                   size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
690                                   int depth, const char *build_options)
691         {
692             //construct kernel name
693             //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
694             //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
695             stringstream idxStr;
696             if(channels != -1)
697                 idxStr << "_C" << channels;
698             if(depth != -1)
699                 idxStr << "_D" << depth;
700             kernelName += idxStr.str();
701
702             cl_kernel kernel;
703             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
704
705             if ( localThreads != NULL)
706             {
707                 globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
708                 globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
709                 globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
710
711                 //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
712                 cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
713             }
714             for(size_t i = 0; i < args.size(); i ++)
715                 openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
716
717 #ifndef PRINT_KERNEL_RUN_TIME
718             openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
719                                                   localThreads, 0, NULL, NULL));
720 #else
721             cl_event event = NULL;
722             openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
723                                                   localThreads, 0, NULL, &event));
724
725             cl_ulong start_time, end_time, queue_time;
726             double execute_time = 0;
727             double total_time   = 0;
728
729             openCLSafeCall(clWaitForEvents(1, &event));
730             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
731                                                    sizeof(cl_ulong), &start_time, 0));
732
733             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
734                                                    sizeof(cl_ulong), &end_time, 0));
735
736             openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
737                                                    sizeof(cl_ulong), &queue_time, 0));
738
739             execute_time = (double)(end_time - start_time) / (1000 * 1000);
740             total_time = (double)(end_time - queue_time) / (1000 * 1000);
741
742             //  cout << setiosflags(ios::left) << setw(15) << execute_time;
743             //  cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
744             //  cout << setiosflags(ios::left) << setw(15) << total_time << endl;
745
746             total_execute_time += execute_time;
747             total_kernel_time += total_time;
748             clReleaseEvent(event);
749 #endif
750
751             clFlush(clCxt->impl->clCmdQueue);
752             openCLSafeCall(clReleaseKernel(kernel));
753         }
754
755         void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
756                                  size_t globalThreads[3], size_t localThreads[3],
757                                  vector< pair<size_t, const void *> > &args, int channels, int depth)
758         {
759             openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
760                                 channels, depth, NULL);
761         }
762         void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
763                                  size_t globalThreads[3], size_t localThreads[3],
764                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
765
766         {
767 #ifndef PRINT_KERNEL_RUN_TIME
768             openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
769                                  build_options);
770 #else
771             string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
772             cout << endl;
773             cout << "Function Name: " << kernelName;
774             if(depth >= 0)
775                 cout << " |data type: " << data_type[depth];
776             cout << " |channels: " << channels;
777             cout << " |Time Unit: " << "ms" << endl;
778
779             total_execute_time = 0;
780             total_kernel_time = 0;
781             cout << "-------------------------------------" << endl;
782
783             cout << setiosflags(ios::left) << setw(15) << "excute time";
784             cout << setiosflags(ios::left) << setw(15) << "lauch time";
785             cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
786             int i = 0;
787             for(i = 0; i < RUN_TIMES; i++)
788                 openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
789                                      build_options);
790
791             cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
792             cout << "average kernel total time:  " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
793 #endif
794         }
795
796        double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
797                                  size_t globalThreads[3], size_t localThreads[3],
798                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
799                                  bool finish, bool measureKernelTime, bool cleanUp)
800
801         {
802             //construct kernel name
803             //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
804             //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
805             stringstream idxStr;
806             if(channels != -1)
807                 idxStr << "_C" << channels;
808             if(depth != -1)
809                 idxStr << "_D" << depth;
810             kernelName += idxStr.str();
811
812             cl_kernel kernel;
813             kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
814
815             double kernelTime = 0.0;
816
817             if( globalThreads != NULL)
818             {
819                 if ( localThreads != NULL)
820                 {
821                     globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
822                     globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
823                     globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
824
825                     //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
826                     cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
827                 }
828                 for(size_t i = 0; i < args.size(); i ++)
829                     openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
830
831                 if(measureKernelTime == false)
832                 {
833                     openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
834                                     localThreads, 0, NULL, NULL));
835                 }
836                 else
837                 {
838                     cl_event event = NULL;
839                     openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
840                                     localThreads, 0, NULL, &event));
841
842                     cl_ulong end_time, queue_time;
843
844                     openCLSafeCall(clWaitForEvents(1, &event));
845
846                     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
847                                     sizeof(cl_ulong), &end_time, 0));
848
849                     openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
850                                     sizeof(cl_ulong), &queue_time, 0));
851
852                     kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
853
854                     clReleaseEvent(event);
855                 }
856             }
857
858             if(finish)
859             {
860                 clFinish(clCxt->impl->clCmdQueue);
861             }
862
863             if(cleanUp)
864             {
865                 openCLSafeCall(clReleaseKernel(kernel));
866             }
867
868             return kernelTime;
869         }
870
871         // Converts the contents of a file into a string
872         static int convertToString(const char *filename, std::string& s)
873         {
874             size_t size;
875             char*  str;
876
877             std::fstream f(filename, (std::fstream::in | std::fstream::binary));
878             if(f.is_open())
879             {
880                 size_t fileSize;
881                 f.seekg(0, std::fstream::end);
882                 size = fileSize = (size_t)f.tellg();
883                 f.seekg(0, std::fstream::beg);
884
885                 str = new char[size+1];
886                 if(!str)
887                 {
888                     f.close();
889                     return -1;
890                 }
891
892                 f.read(str, fileSize);
893                 f.close();
894                 str[size] = '\0';
895
896                 s = str;
897                 delete[] str;
898                 return 0;
899             }
900             printf("Error: Failed to open file %s\n", filename);
901             return -1;
902         }
903
904         double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
905                                  size_t globalThreads[3], size_t localThreads[3],
906                                  vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
907                                  bool finish, bool measureKernelTime, bool cleanUp)
908
909         {
910             std::vector<std::string> fsource;
911             for (int i = 0 ; i < numFiles ; i++)
912             {
913                 std::string str;
914                 if (convertToString(fileName[i], str) >= 0)
915                     fsource.push_back(str);
916             }
917             const char **source = new const char *[numFiles];
918             for (int i = 0 ; i < numFiles ; i++)
919                 source[i] = fsource[i].c_str();
920             double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads,
921                                  args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
922             fsource.clear();
923             delete []source;
924             return kernelTime;
925         }
926
927         cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
928                              const size_t size)
929         {
930             int status;
931             cl_mem con_struct;
932
933             con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
934             openCLSafeCall(status);
935
936             openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
937                                                 value, 0, 0, 0));
938
939             return con_struct;
940
941         }
942
943         /////////////////////////////OpenCL initialization/////////////////
944         auto_ptr<Context> Context::clCxt;
945         int Context::val = 0;
946         static Mutex cs;
947         static volatile int context_tear_down = 0;
948
949         bool initialized()
950         {
951             return *((volatile int*)&Context::val) != 0 && 
952                 Context::clCxt->impl->clCmdQueue != NULL&& 
953                 Context::clCxt->impl->oclcontext != NULL;
954         }
955
956         Context* Context::getContext()
957         {
958             if(*((volatile int*)&val) != 1)
959             {
960                 AutoLock al(cs);
961                 if(*((volatile int*)&val) != 1)
962                 {
963                     if (context_tear_down)
964                         return clCxt.get();
965                     if( 0 == clCxt.get())
966                         clCxt.reset(new Context);
967                     std::vector<Info> oclinfo;
968                     CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
969
970                     *((volatile int*)&val) = 1;
971                 }
972             }
973             return clCxt.get();
974         }
975
976         void Context::setContext(Info &oclinfo)
977         {
978             AutoLock guard(cs);
979             if(*((volatile int*)&val) != 1)
980             {
981                 if( 0 == clCxt.get())
982                     clCxt.reset(new Context);
983
984                 clCxt.get()->impl = oclinfo.impl->copy();
985
986                 *((volatile int*)&val) = 1;
987             }
988             else
989             {
990                 clCxt.get()->impl->release();
991                 clCxt.get()->impl = oclinfo.impl->copy();
992             }
993         }
994
995         Context::Context()
996         {
997             impl = 0;
998             programCache = ProgramCache::getProgramCache();
999         }
1000
1001         Context::~Context()
1002         {
1003             release();
1004         }
1005
1006         void Context::release()
1007         {
1008             if (impl)
1009                 impl->release();
1010             programCache->releaseProgram();
1011         }
1012
1013         bool Context::supportsFeature(int ftype)
1014         {
1015             switch(ftype)
1016             {
1017             case CL_DOUBLE:
1018                 return impl->double_support == 1;
1019             case CL_UNIFIED_MEM:
1020                 return impl->unified_memory == 1;
1021             case CL_VER_1_2:
1022                 return impl->clVersion.find("OpenCL 1.2") != string::npos;
1023             default:
1024                 return false;
1025             }
1026         }
1027
1028         size_t Context::computeUnits()
1029         {
1030             return impl->maxComputeUnits;
1031         }
1032
1033         void* Context::oclContext()
1034         {
1035             return impl->oclcontext;
1036         }
1037
1038         void* Context::oclCommandQueue()
1039         {
1040             return impl->clCmdQueue;
1041         }
1042
1043         Info::Info()
1044         {
1045             impl = new Impl;
1046         }
1047
1048         void Info::release()
1049         {
1050             fft_teardown();
1051             impl->release();
1052             impl = new Impl;
1053             DeviceName.clear();
1054         }
1055
1056         Info::~Info()
1057         {
1058             fft_teardown();
1059             impl->release();
1060         }
1061
1062         Info &Info::operator = (const Info &m)
1063         {
1064             impl->release();
1065             impl = m.impl->copy();
1066             DeviceName = m.DeviceName;
1067             return *this;
1068         }
1069
1070         Info::Info(const Info &m)
1071         {
1072             impl = m.impl->copy();
1073             DeviceName = m.DeviceName;
1074         }
1075     }//namespace ocl
1076
1077 }//namespace cv
1078
1079 #if defined BUILD_SHARED_LIBS && defined CVAPI_EXPORTS && defined WIN32 && !defined WINCE
1080 #include <windows.h>
1081 BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID );
1082
1083 BOOL WINAPI DllMain( HINSTANCE, DWORD  fdwReason, LPVOID )
1084 {
1085     if( fdwReason == DLL_PROCESS_DETACH )
1086     {
1087         // application hangs if call clReleaseCommandQueue here, so release context only
1088         // without context release application hangs as well
1089         context_tear_down = 1;
1090         Context* cv_ctx = Context::getContext();
1091         if(cv_ctx)
1092         {
1093             cl_context ctx = cv_ctx->impl->oclcontext;
1094             if(ctx)
1095                 openCLSafeCall(clReleaseContext(ctx));
1096         }
1097     }
1098     return TRUE;
1099 }
1100 #endif