1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
19 // Guoping Long, longguoping@gmail.com
20 // Niko Li, newlife20080214@gmail.com
21 // Yao Wang, bitwangyaoyao@gmail.com
22 // Redistribution and use in source and binary forms, with or without modification,
23 // are permitted provided that the following conditions are met:
25 // * Redistribution's of source code must retain the above copyright notice,
26 // this list of conditions and the following disclaimer.
28 // * Redistribution's in binary form must reproduce the above copyright notice,
29 // this list of conditions and the following disclaimer in the documentation
30 // and/or other oclMaterials provided with the distribution.
32 // * The name of the copyright holders may not be used to endorse or promote products
33 // derived from this software without specific prior written permission.
35 // This software is provided by the copyright holders and contributors "as is" and
36 // any express or implied warranties, including, but not limited to, the implied
37 // warranties of merchantability and fitness for a particular purpose are disclaimed.
38 // In no event shall the Intel Corporation or contributors be liable for any direct,
39 // indirect, incidental, special, exemplary, or consequential damages
40 // (including, but not limited to, procurement of substitute goods or services;
41 // loss of use, data, or profits; or business interruption) however caused
42 // and on any theory of liability, whether in contract, strict liability,
43 // or tort (including negligence or otherwise) arising in any way out of
44 // the use of this software, even if advised of the possibility of such damage.
48 #include "precomp.hpp"
51 #include "binarycaching.hpp"
54 using namespace cv::ocl;
59 //#define PRINT_KERNEL_RUN_TIME
61 #ifndef CL_MEM_USE_PERSISTENT_MEM_AMD
62 #define CL_MEM_USE_PERSISTENT_MEM_AMD 0
64 //#define AMD_DOUBLE_DIFFER
70 extern void fft_teardown();
72 * The binary caching system to eliminate redundant program source compilation.
73 * Strictly, this is not a cache because we do not implement evictions right now.
74 * We shall add such features to trade-off memory consumption and performance when necessary.
76 auto_ptr<ProgramCache> ProgramCache::programCache;
77 ProgramCache *programCache = NULL;
78 DevMemType gDeviceMemType = DEVICE_MEM_DEFAULT;
79 DevMemRW gDeviceMemRW = DEVICE_MEM_R_W;
80 int gDevMemTypeValueMap[5] = {0,
81 CL_MEM_ALLOC_HOST_PTR,
84 CL_MEM_USE_PERSISTENT_MEM_AMD};
85 int gDevMemRWValueMap[3] = {CL_MEM_READ_WRITE, CL_MEM_READ_ONLY, CL_MEM_WRITE_ONLY};
87 ProgramCache::ProgramCache()
93 ProgramCache::~ProgramCache()
98 cl_program ProgramCache::progLookup(string srcsign)
100 map<string, cl_program>::iterator iter;
101 iter = codeCache.find(srcsign);
102 if(iter != codeCache.end())
108 void ProgramCache::addProgram(string srcsign , cl_program program)
110 if(!progLookup(srcsign))
112 codeCache.insert(map<string, cl_program>::value_type(srcsign, program));
116 void ProgramCache::releaseProgram()
118 map<string, cl_program>::iterator iter;
119 for(iter = codeCache.begin(); iter != codeCache.end(); iter++)
121 openCLSafeCall(clReleaseProgram(iter->second));
127 // not to be exported to dynamic lib
128 void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl);
131 cl_platform_id oclplatform;
132 std::vector<cl_device_id> devices;
133 std::vector<std::string> devName;
134 std::string clVersion;
136 cl_context oclcontext;
137 cl_command_queue clCmdQueue;
139 size_t maxWorkGroupSize;
140 cl_uint maxDimensions; // == maxWorkItemSizes.size()
141 std::vector<size_t> maxWorkItemSizes;
142 cl_uint maxComputeUnits;
143 char extra_options[512];
145 int unified_memory; //1 means integrated GPU, otherwise this value is 0
146 bool enable_disk_cache;
147 bool update_disk_cache;
153 void setDevice(void *ctx, void *q, int devnum);
157 if(1 == CV_XADD(&refcounter, -1))
166 CV_XADD(&refcounter, 1);
172 Impl& operator=(const Impl&);
173 void releaseResources();
186 enable_disk_cache(false),
187 update_disk_cache(false),
191 memset(extra_options, 0, 512);
192 setBinaryDiskCacheImpl(CACHE_RELEASE, String("./"), this);
195 void Info::Impl::releaseResources()
201 openCLSafeCall(clReleaseCommandQueue(clCmdQueue));
207 openCLSafeCall(clReleaseContext(oclcontext));
212 void Info::Impl::setDevice(void *ctx, void *q, int dnum)
214 if((ctx && q) || devnum != dnum)
217 CV_Assert(dnum >= 0 && dnum < (int)devices.size());
221 oclcontext = (cl_context)ctx;
222 clCmdQueue = (cl_command_queue)q;
223 clRetainContext(oclcontext);
224 clRetainCommandQueue(clCmdQueue);
229 cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(oclplatform), 0 };
230 oclcontext = clCreateContext(cps, 1, &devices[devnum], 0, 0, &status);
231 openCLVerifyCall(status);
232 clCmdQueue = clCreateCommandQueue(oclcontext, devices[devnum], CL_QUEUE_PROFILING_ENABLE, &status);
233 openCLVerifyCall(status);
236 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&maxWorkGroupSize, 0));
237 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void *)&maxDimensions, 0));
238 maxWorkItemSizes.resize(maxDimensions);
239 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDimensions, (void *)&maxWorkItemSizes[0], 0));
240 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void *)&maxComputeUnits, 0));
242 cl_bool unfymem = false;
243 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool), (void *)&unfymem, 0));
244 unified_memory = unfymem ? 1 : 0;
246 //initialize extra options for compilation. Currently only fp64 is included.
247 //Assume 4KB is enough to store all possible extensions.
248 const int EXT_LEN = 4096 + 1 ;
249 char extends_set[EXT_LEN];
251 openCLSafeCall(clGetDeviceInfo(devices[devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
252 extends_set[EXT_LEN - 1] = 0;
253 size_t fp64_khr = std::string(extends_set).find("cl_khr_fp64");
255 if(fp64_khr != std::string::npos)
257 sprintf(extra_options, "-D DOUBLE_SUPPORT");
262 memset(extra_options, 0, 512);
267 ////////////////////////Common OpenCL specific calls///////////////
268 int getDevMemType(DevMemRW& rw_type, DevMemType& mem_type)
270 rw_type = gDeviceMemRW;
271 mem_type = gDeviceMemType;
272 return Context::getContext()->impl->unified_memory;
275 int setDevMemType(DevMemRW rw_type, DevMemType mem_type)
277 if( (mem_type == DEVICE_MEM_PM && Context::getContext()->impl->unified_memory == 0) ||
278 mem_type == DEVICE_MEM_UHP ||
279 mem_type == DEVICE_MEM_CHP )
281 gDeviceMemRW = rw_type;
282 gDeviceMemType = mem_type;
286 inline int divUp(int total, int grain)
288 return (total + grain - 1) / grain;
291 int getDevice(std::vector<Info> &oclinfo, int devicetype)
293 //TODO: cache oclinfo vector
298 case CVCL_DEVICE_TYPE_DEFAULT:
299 case CVCL_DEVICE_TYPE_CPU:
300 case CVCL_DEVICE_TYPE_GPU:
301 case CVCL_DEVICE_TYPE_ACCELERATOR:
302 case CVCL_DEVICE_TYPE_ALL:
309 cl_uint numPlatforms;
310 openCLSafeCall(clGetPlatformIDs(0, 0, &numPlatforms));
311 if(numPlatforms < 1) return 0;
313 std::vector<cl_platform_id> platforms(numPlatforms);
314 openCLSafeCall(clGetPlatformIDs(numPlatforms, &platforms[0], 0));
316 char deviceName[256];
319 for (unsigned i = 0; i < numPlatforms; ++i)
322 cl_int status = clGetDeviceIDs(platforms[i], devicetype, 0, NULL, &numsdev);
323 if(status != CL_DEVICE_NOT_FOUND)
324 openCLVerifyCall(status);
328 devcienums += numsdev;
329 std::vector<cl_device_id> devices(numsdev);
330 openCLSafeCall(clGetDeviceIDs(platforms[i], devicetype, numsdev, &devices[0], 0));
333 ocltmpinfo.impl->oclplatform = platforms[i];
334 openCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, sizeof(clVersion), clVersion, NULL));
335 ocltmpinfo.impl->clVersion = clVersion;
336 for(unsigned j = 0; j < numsdev; ++j)
338 ocltmpinfo.impl->devices.push_back(devices[j]);
339 openCLSafeCall(clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, 0));
340 ocltmpinfo.impl->devName.push_back(deviceName);
341 ocltmpinfo.DeviceName.push_back(deviceName);
343 oclinfo.push_back(ocltmpinfo);
348 setDevice(oclinfo[0]);
353 void setDevice(Info &oclinfo, int devnum)
355 oclinfo.impl->setDevice(0, 0, devnum);
356 Context::setContext(oclinfo);
359 void setDeviceEx(Info &oclinfo, void *ctx, void *q, int devnum)
361 oclinfo.impl->setDevice(ctx, q, devnum);
362 Context::setContext(oclinfo);
365 void *getoclContext()
367 return &(Context::getContext()->impl->oclcontext);
370 void *getoclCommandQueue()
372 return &(Context::getContext()->impl->clCmdQueue);
377 clFinish(Context::getContext()->impl->clCmdQueue);
380 //template specializations of queryDeviceInfo
382 bool queryDeviceInfo<IS_CPU_DEVICE, bool>(cl_kernel)
384 Info::Impl* impl = Context::getContext()->impl;
385 cl_device_type devicetype;
386 openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum],
387 CL_DEVICE_TYPE, sizeof(cl_device_type),
389 return (devicetype == CVCL_DEVICE_TYPE_CPU);
392 template<typename _ty>
393 static _ty queryWavesize(cl_kernel kernel)
396 Info::Impl* impl = Context::getContext()->impl;
397 bool is_cpu = queryDeviceInfo<IS_CPU_DEVICE, bool>();
402 CV_Assert(kernel != NULL);
403 openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
404 CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &info, NULL));
405 return static_cast<_ty>(info);
409 size_t queryDeviceInfo<WAVEFRONT_SIZE, size_t>(cl_kernel kernel)
411 return queryWavesize<size_t>(kernel);
414 int queryDeviceInfo<WAVEFRONT_SIZE, int>(cl_kernel kernel)
416 return queryWavesize<int>(kernel);
419 void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
422 status = clEnqueueReadBuffer(clCxt->impl->clCmdQueue, dst_buffer, CL_TRUE, 0,
423 size, host_buffer, 0, NULL, NULL);
424 openCLVerifyCall(status);
427 cl_mem openCLCreateBuffer(Context *clCxt, size_t flag , size_t size)
430 cl_mem buffer = clCreateBuffer(clCxt->impl->oclcontext, (cl_mem_flags)flag, size, NULL, &status);
431 openCLVerifyCall(status);
435 void openCLMallocPitch(Context *clCxt, void **dev_ptr, size_t *pitch,
436 size_t widthInBytes, size_t height)
438 openCLMallocPitchEx(clCxt, dev_ptr, pitch, widthInBytes, height, gDeviceMemRW, gDeviceMemType);
441 void openCLMallocPitchEx(Context *clCxt, void **dev_ptr, size_t *pitch,
442 size_t widthInBytes, size_t height, DevMemRW rw_type, DevMemType mem_type)
445 *dev_ptr = clCreateBuffer(clCxt->impl->oclcontext, gDevMemRWValueMap[rw_type]|gDevMemTypeValueMap[mem_type],
446 widthInBytes * height, 0, &status);
447 openCLVerifyCall(status);
448 *pitch = widthInBytes;
451 void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
452 const void *src, size_t spitch,
453 size_t width, size_t height, openCLMemcpyKind kind, int channels)
455 size_t buffer_origin[3] = {0, 0, 0};
456 size_t host_origin[3] = {0, 0, 0};
457 size_t region[3] = {width, height, 1};
458 if(kind == clMemcpyHostToDevice)
460 if(dpitch == width || channels == 3 || height == 1)
462 openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
463 0, width * height, src, 0, NULL, NULL));
467 openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
468 buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
471 else if(kind == clMemcpyDeviceToHost)
473 if(spitch == width || channels == 3 || height == 1)
475 openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
476 0, width * height, dst, 0, NULL, NULL));
480 openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
481 buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
486 void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
487 const void *src, size_t spitch,
488 size_t width, size_t height, int src_offset)
490 size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
491 size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
492 size_t region[3] = {width, height, 1};
494 openCLSafeCall(clEnqueueCopyBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, (cl_mem)dst, src_origin, dst_origin,
495 region, spitch, 0, dpitch, 0, 0, 0, 0));
498 void openCLFree(void *devPtr)
500 openCLSafeCall(clReleaseMemObject((cl_mem)devPtr));
502 cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName)
504 return openCLGetKernelFromSource(clCxt, source, kernelName, NULL);
507 void setBinaryDiskCacheImpl(int mode, String path, Info::Impl * impl)
509 impl->update_disk_cache = (mode & CACHE_UPDATE) == CACHE_UPDATE;
510 impl->enable_disk_cache =
512 (mode & CACHE_DEBUG) == CACHE_DEBUG;
514 (mode & CACHE_RELEASE) == CACHE_RELEASE;
516 if(impl->enable_disk_cache && !path.empty())
518 impl->binpath = path;
521 void setBinaryDiskCache(int mode, cv::String path)
523 setBinaryDiskCacheImpl(mode, path, Context::getContext()->impl);
526 void setBinpath(const char *path)
528 Context *clcxt = Context::getContext();
529 clcxt->impl->binpath = path;
532 int savetofile(const Context*, cl_program &program, const char *fileName)
535 openCLSafeCall(clGetProgramInfo(program,
536 CL_PROGRAM_BINARY_SIZES,
539 char* binary = (char*)malloc(binarySize);
542 CV_Error(CV_StsNoMem, "Failed to allocate host memory.");
544 openCLSafeCall(clGetProgramInfo(program,
550 FILE *fp = fopen(fileName, "wb+");
553 fwrite(binary, binarySize, 1, fp);
560 cl_kernel openCLGetKernelFromSource(const Context *clCxt, const char **source, string kernelName,
561 const char *build_options)
566 stringstream src_sign;
569 CV_Assert(programCache != NULL);
571 if(NULL != build_options)
573 src_sign << (int64)(*source) << clCxt->impl->oclcontext << "_" << build_options;
577 src_sign << (int64)(*source) << clCxt->impl->oclcontext;
579 srcsign = src_sign.str();
582 program = programCache->progLookup(srcsign);
586 //config build programs
587 char all_build_options[1024];
588 memset(all_build_options, 0, 1024);
589 char zeromem[512] = {0};
590 if(0 != memcmp(clCxt -> impl->extra_options, zeromem, 512))
591 strcat(all_build_options, clCxt -> impl->extra_options);
592 strcat(all_build_options, " ");
593 if(build_options != NULL)
594 strcat(all_build_options, build_options);
595 if(all_build_options != NULL)
597 filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + all_build_options + ".clb";
601 filename = clCxt->impl->binpath + kernelName + "_" + clCxt->impl->devName[clCxt->impl->devnum] + ".clb";
604 FILE *fp = clCxt->impl->enable_disk_cache ? fopen(filename.c_str(), "rb") : NULL;
605 if(fp == NULL || clCxt->impl->update_disk_cache)
610 program = clCreateProgramWithSource(
611 clCxt->impl->oclcontext, 1, source, NULL, &status);
612 openCLVerifyCall(status);
613 status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
614 if(status == CL_SUCCESS && clCxt->impl->enable_disk_cache)
615 savetofile(clCxt, program, filename.c_str());
619 fseek(fp, 0, SEEK_END);
620 size_t binarySize = ftell(fp);
621 fseek(fp, 0, SEEK_SET);
622 char *binary = new char[binarySize];
623 CV_Assert(1 == fread(binary, binarySize, 1, fp));
626 program = clCreateProgramWithBinary(clCxt->impl->oclcontext,
628 &(clCxt->impl->devices[clCxt->impl->devnum]),
629 (const size_t *)&binarySize,
630 (const unsigned char **)&binary,
633 openCLVerifyCall(status);
634 status = clBuildProgram(program, 1, &(clCxt->impl->devices[clCxt->impl->devnum]), all_build_options, NULL, NULL);
638 if(status != CL_SUCCESS)
640 if(status == CL_BUILD_PROGRAM_FAILURE)
643 char *buildLog = NULL;
644 size_t buildLogSize = 0;
645 logStatus = clGetProgramBuildInfo(program,
646 clCxt->impl->devices[clCxt->impl->devnum], CL_PROGRAM_BUILD_LOG, buildLogSize,
647 buildLog, &buildLogSize);
648 if(logStatus != CL_SUCCESS)
649 cout << "Failed to build the program and get the build info." << endl;
650 buildLog = new char[buildLogSize];
651 CV_DbgAssert(!!buildLog);
652 memset(buildLog, 0, buildLogSize);
653 openCLSafeCall(clGetProgramBuildInfo(program, clCxt->impl->devices[clCxt->impl->devnum],
654 CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL));
655 cout << "\n\t\t\tBUILD LOG\n";
656 cout << buildLog << endl;
659 openCLVerifyCall(status);
661 //Cache the binary for future use if build_options is null
662 if( (programCache->cacheSize += 1) < programCache->MAX_PROG_CACHE_SIZE)
663 programCache->addProgram(srcsign, program);
665 cout << "Warning: code cache has been full.\n";
667 kernel = clCreateKernel(program, kernelName.c_str(), &status);
668 openCLVerifyCall(status);
672 void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
674 size_t kernelWorkGroupSize;
675 openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[clCxt->impl->devnum],
676 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0));
677 CV_Assert( localThreads[0] <= clCxt->impl->maxWorkItemSizes[0] );
678 CV_Assert( localThreads[1] <= clCxt->impl->maxWorkItemSizes[1] );
679 CV_Assert( localThreads[2] <= clCxt->impl->maxWorkItemSizes[2] );
680 CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= kernelWorkGroupSize );
681 CV_Assert( localThreads[0] * localThreads[1] * localThreads[2] <= clCxt->impl->maxWorkGroupSize );
684 #ifdef PRINT_KERNEL_RUN_TIME
685 static double total_execute_time = 0;
686 static double total_kernel_time = 0;
688 void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
689 size_t localThreads[3], vector< pair<size_t, const void *> > &args, int channels,
690 int depth, const char *build_options)
692 //construct kernel name
693 //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
694 //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
697 idxStr << "_C" << channels;
699 idxStr << "_D" << depth;
700 kernelName += idxStr.str();
703 kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
705 if ( localThreads != NULL)
707 globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
708 globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
709 globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
711 //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
712 cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
714 for(size_t i = 0; i < args.size(); i ++)
715 openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
717 #ifndef PRINT_KERNEL_RUN_TIME
718 openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
719 localThreads, 0, NULL, NULL));
721 cl_event event = NULL;
722 openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
723 localThreads, 0, NULL, &event));
725 cl_ulong start_time, end_time, queue_time;
726 double execute_time = 0;
727 double total_time = 0;
729 openCLSafeCall(clWaitForEvents(1, &event));
730 openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
731 sizeof(cl_ulong), &start_time, 0));
733 openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
734 sizeof(cl_ulong), &end_time, 0));
736 openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
737 sizeof(cl_ulong), &queue_time, 0));
739 execute_time = (double)(end_time - start_time) / (1000 * 1000);
740 total_time = (double)(end_time - queue_time) / (1000 * 1000);
742 // cout << setiosflags(ios::left) << setw(15) << execute_time;
743 // cout << setiosflags(ios::left) << setw(15) << total_time - execute_time;
744 // cout << setiosflags(ios::left) << setw(15) << total_time << endl;
746 total_execute_time += execute_time;
747 total_kernel_time += total_time;
748 clReleaseEvent(event);
751 clFlush(clCxt->impl->clCmdQueue);
752 openCLSafeCall(clReleaseKernel(kernel));
755 void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
756 size_t globalThreads[3], size_t localThreads[3],
757 vector< pair<size_t, const void *> > &args, int channels, int depth)
759 openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args,
760 channels, depth, NULL);
762 void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName,
763 size_t globalThreads[3], size_t localThreads[3],
764 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options)
767 #ifndef PRINT_KERNEL_RUN_TIME
768 openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
771 string data_type[] = { "uchar", "char", "ushort", "short", "int", "float", "double"};
773 cout << "Function Name: " << kernelName;
775 cout << " |data type: " << data_type[depth];
776 cout << " |channels: " << channels;
777 cout << " |Time Unit: " << "ms" << endl;
779 total_execute_time = 0;
780 total_kernel_time = 0;
781 cout << "-------------------------------------" << endl;
783 cout << setiosflags(ios::left) << setw(15) << "excute time";
784 cout << setiosflags(ios::left) << setw(15) << "lauch time";
785 cout << setiosflags(ios::left) << setw(15) << "kernel time" << endl;
787 for(i = 0; i < RUN_TIMES; i++)
788 openCLExecuteKernel_(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth,
791 cout << "average kernel excute time: " << total_execute_time / RUN_TIMES << endl; // "ms" << endl;
792 cout << "average kernel total time: " << total_kernel_time / RUN_TIMES << endl; // "ms" << endl;
796 double openCLExecuteKernelInterop(Context *clCxt , const char **source, string kernelName,
797 size_t globalThreads[3], size_t localThreads[3],
798 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
799 bool finish, bool measureKernelTime, bool cleanUp)
802 //construct kernel name
803 //The rule is functionName_Cn_Dn, C represent Channels, D Represent DataType Depth, n represent an integer number
804 //for exmaple split_C2_D2, represent the split kernel with channels =2 and dataType Depth = 2(Data type is char)
807 idxStr << "_C" << channels;
809 idxStr << "_D" << depth;
810 kernelName += idxStr.str();
813 kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
815 double kernelTime = 0.0;
817 if( globalThreads != NULL)
819 if ( localThreads != NULL)
821 globalThreads[0] = divUp(globalThreads[0], localThreads[0]) * localThreads[0];
822 globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
823 globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
825 //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
826 cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
828 for(size_t i = 0; i < args.size(); i ++)
829 openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
831 if(measureKernelTime == false)
833 openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
834 localThreads, 0, NULL, NULL));
838 cl_event event = NULL;
839 openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
840 localThreads, 0, NULL, &event));
842 cl_ulong end_time, queue_time;
844 openCLSafeCall(clWaitForEvents(1, &event));
846 openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
847 sizeof(cl_ulong), &end_time, 0));
849 openCLSafeCall(clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED,
850 sizeof(cl_ulong), &queue_time, 0));
852 kernelTime = (double)(end_time - queue_time) / (1000 * 1000);
854 clReleaseEvent(event);
860 clFinish(clCxt->impl->clCmdQueue);
865 openCLSafeCall(clReleaseKernel(kernel));
871 // Converts the contents of a file into a string
872 static int convertToString(const char *filename, std::string& s)
877 std::fstream f(filename, (std::fstream::in | std::fstream::binary));
881 f.seekg(0, std::fstream::end);
882 size = fileSize = (size_t)f.tellg();
883 f.seekg(0, std::fstream::beg);
885 str = new char[size+1];
892 f.read(str, fileSize);
900 printf("Error: Failed to open file %s\n", filename);
904 double openCLExecuteKernelInterop(Context *clCxt , const char **fileName, const int numFiles, string kernelName,
905 size_t globalThreads[3], size_t localThreads[3],
906 vector< pair<size_t, const void *> > &args, int channels, int depth, const char *build_options,
907 bool finish, bool measureKernelTime, bool cleanUp)
910 std::vector<std::string> fsource;
911 for (int i = 0 ; i < numFiles ; i++)
914 if (convertToString(fileName[i], str) >= 0)
915 fsource.push_back(str);
917 const char **source = new const char *[numFiles];
918 for (int i = 0 ; i < numFiles ; i++)
919 source[i] = fsource[i].c_str();
920 double kernelTime = openCLExecuteKernelInterop(clCxt ,source, kernelName, globalThreads, localThreads,
921 args, channels, depth, build_options, finish, measureKernelTime, cleanUp);
927 cl_mem load_constant(cl_context context, cl_command_queue command_queue, const void *value,
933 con_struct = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &status);
934 openCLSafeCall(status);
936 openCLSafeCall(clEnqueueWriteBuffer(command_queue, con_struct, 1, 0, size,
943 /////////////////////////////OpenCL initialization/////////////////
944 auto_ptr<Context> Context::clCxt;
945 int Context::val = 0;
947 static volatile int context_tear_down = 0;
951 return *((volatile int*)&Context::val) != 0 &&
952 Context::clCxt->impl->clCmdQueue != NULL&&
953 Context::clCxt->impl->oclcontext != NULL;
956 Context* Context::getContext()
958 if(*((volatile int*)&val) != 1)
961 if(*((volatile int*)&val) != 1)
963 if (context_tear_down)
965 if( 0 == clCxt.get())
966 clCxt.reset(new Context);
967 std::vector<Info> oclinfo;
968 CV_Assert(getDevice(oclinfo, CVCL_DEVICE_TYPE_ALL) > 0);
970 *((volatile int*)&val) = 1;
976 void Context::setContext(Info &oclinfo)
979 if(*((volatile int*)&val) != 1)
981 if( 0 == clCxt.get())
982 clCxt.reset(new Context);
984 clCxt.get()->impl = oclinfo.impl->copy();
986 *((volatile int*)&val) = 1;
990 clCxt.get()->impl->release();
991 clCxt.get()->impl = oclinfo.impl->copy();
998 programCache = ProgramCache::getProgramCache();
1006 void Context::release()
1010 programCache->releaseProgram();
1013 bool Context::supportsFeature(int ftype)
1018 return impl->double_support == 1;
1019 case CL_UNIFIED_MEM:
1020 return impl->unified_memory == 1;
1022 return impl->clVersion.find("OpenCL 1.2") != string::npos;
1028 size_t Context::computeUnits()
1030 return impl->maxComputeUnits;
1033 void* Context::oclContext()
1035 return impl->oclcontext;
1038 void* Context::oclCommandQueue()
1040 return impl->clCmdQueue;
1048 void Info::release()
1062 Info &Info::operator = (const Info &m)
1065 impl = m.impl->copy();
1066 DeviceName = m.DeviceName;
1070 Info::Info(const Info &m)
1072 impl = m.impl->copy();
1073 DeviceName = m.DeviceName;
1079 #if defined BUILD_SHARED_LIBS && defined CVAPI_EXPORTS && defined WIN32 && !defined WINCE
1080 #include <windows.h>
1081 BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID );
1083 BOOL WINAPI DllMain( HINSTANCE, DWORD fdwReason, LPVOID )
1085 if( fdwReason == DLL_PROCESS_DETACH )
1087 // application hangs if call clReleaseCommandQueue here, so release context only
1088 // without context release application hangs as well
1089 context_tear_down = 1;
1090 Context* cv_ctx = Context::getContext();
1093 cl_context ctx = cv_ctx->impl->oclcontext;
1095 openCLSafeCall(clReleaseContext(ctx));