1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
19 // * Redistribution's of source code must retain the above copyright notice,
20 // this list of conditions and the following disclaimer.
22 // * Redistribution's in binary form must reproduce the above copyright notice,
23 // this list of conditions and the following disclaimer in the documentation
24 // and/or other materials provided with the distribution.
26 // * The name of the copyright holders may not be used to endorse or promote products
27 // derived from this software without specific prior written permission.
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
42 #include "precomp.hpp"
45 #include "ocl_disabled.impl.hpp"
54 #include <iostream> // std::cerr
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
60 #include <opencv2/core/utils/configuration.private.hpp>
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
80 #define CV_OPENCL_SHOW_RUN_KERNELS 0
81 #define CV_OPENCL_TRACE_CHECK 0
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG 1
86 #define CV_OPENCL_SHOW_SVM_LOG 0
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
91 # define LOG_BUFFER_POOL printf
93 # define LOG_BUFFER_POOL(...)
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
101 #define CV_OPENCL_SVM_TRACE_P(...)
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
117 #include "directx.hpp"
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
126 #include "umatrix.hpp"
128 namespace cv { namespace ocl {
130 #define IMPLEMENT_REFCOUNTABLE() \
131 void addref() { CV_XADD(&refcount, 1); } \
132 void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
140 return opencl_allocator_stats;
144 static bool isRaiseError()
146 static bool initialized = false;
147 static bool value = false;
150 value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
157 #if CV_OPENCL_TRACE_CHECK
159 void traceOpenCLCheck(cl_int status, const char* message)
161 std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169 cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
173 CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174 if (check_result != CL_SUCCESS) \
176 static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177 cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178 CV_Error(Error::OpenCLApiCallError, error_msg); \
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
193 CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194 if (check_result != CL_SUCCESS && isRaiseError()) \
196 static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197 cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198 CV_Error(Error::OpenCLApiCallError, error_msg); \
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
224 static const String getBuildExtraOptions()
226 static String param_buildExtraOptions;
227 static bool initialized = false;
230 param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
232 if (!param_buildExtraOptions.empty())
233 CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
235 return param_buildExtraOptions;
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
244 UMat2D(const UMat& m)
246 offset = (int)m.offset;
259 UMat3D(const UMat& m)
261 offset = (int)m.offset;
262 step = (int)m.step.p[1];
263 slicestep = (int)m.step.p[0];
264 slices = (int)m.size.p[0];
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
279 static uint64 table[256];
280 static bool initialized = false;
284 for( int i = 0; i < 256; i++ )
287 for( int j = 0; j < 8; j++ )
288 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
295 for( size_t idx = 0; idx < size; idx++ )
296 crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
304 cv::String cache_path_;
305 cv::String cache_lock_filename_;
306 cv::Ptr<utils::fs::FileLock> cache_lock_;
308 typedef std::map<std::string, std::string> ContextCacheType;
309 ContextCacheType prepared_contexts_;
310 Mutex mutex_prepared_contexts_;
312 OpenCLBinaryCacheConfigurator()
314 CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315 if (!CV_OPENCL_CACHE_ENABLE)
317 CV_LOG_INFO(NULL, "OpenCL cache is disabled");
320 cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321 if (cache_path_.empty())
323 CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
329 if (cache_path_.empty())
331 if (cache_path_ == "disabled")
333 if (!utils::fs::createDirectories(cache_path_))
335 CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
340 if (CV_OPENCL_CACHE_LOCK_ENABLE)
342 cache_lock_filename_ = cache_path_ + ".lock";
343 if (!utils::fs::exists(cache_lock_filename_))
345 CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346 std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347 if (!lock_filename.is_open())
349 CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
356 cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357 CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
359 utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
361 CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
363 catch (const cv::Exception& e)
365 CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
369 CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
374 if (CV_OPENCL_CACHE_WRITE)
376 CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377 "(not safe for multiprocess environment)");
381 CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
385 catch (const cv::Exception& e)
387 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
391 if (!cache_path_.empty())
393 if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
395 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396 "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
400 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
408 cache_lock_filename_.clear();
409 cache_lock_.release();
412 std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413 const std::string& cleanup_prefix)
415 if (cache_path_.empty())
416 return std::string();
418 AutoLock lock(mutex_prepared_contexts_);
420 ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421 if (found_it != prepared_contexts_.end())
422 return found_it->second;
424 CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
426 std::string target_directory = cache_path_ + ctx_prefix + "/";
427 bool result = utils::fs::isDirectory(target_directory);
432 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433 if (utils::fs::createDirectories(target_directory))
439 CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
442 catch (const cv::Exception& e)
444 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
447 target_directory = result ? target_directory : std::string();
448 prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
450 if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
454 std::vector<String> entries;
455 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456 std::vector<String> remove_entries;
457 for (size_t i = 0; i < entries.size(); i++)
459 const String& name = entries[i];
460 if (0 == name.find(cleanup_prefix))
462 if (0 == name.find(ctx_prefix))
463 continue; // skip current
464 remove_entries.push_back(name);
467 if (!remove_entries.empty())
469 CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470 ? "Detected OpenCL cache directory for other version of OpenCL device."
471 : "Detected OpenCL cache directories for other versions of OpenCL device.")
472 << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473 CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474 for (size_t i = 0; i < remove_entries.size(); i++)
476 CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
478 CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
480 for (size_t i = 0; i < remove_entries.size(); i++)
482 const String& name = remove_entries[i];
483 cv::String path = utils::fs::join(cache_path_, name);
486 utils::fs::remove_all(path);
487 CV_LOG_WARNING(NULL, "Removed: " << path);
489 catch (const cv::Exception& e)
491 CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
498 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
502 CV_LOG_VERBOSE(NULL, 1, " Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503 return target_directory;
506 static OpenCLBinaryCacheConfigurator& getSingletonInstance()
508 CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
511 class BinaryProgramFile
513 enum { MAX_ENTRIES = 64 };
515 typedef unsigned int uint32_t;
517 struct CV_DECL_ALIGNED(4) FileHeader
519 uint32_t sourceSignatureSize;
520 //char sourceSignature[];
523 struct CV_DECL_ALIGNED(4) FileTable
525 uint32_t numberOfEntries;
526 //uint32_t firstEntryOffset[];
529 struct CV_DECL_ALIGNED(4) FileEntry
531 uint32_t nextEntryFileOffset; // 0 for the last entry in chain
538 const std::string fileName_;
539 const char* const sourceSignature_;
540 const size_t sourceSignatureSize_;
544 uint32_t entryOffsets[MAX_ENTRIES];
546 uint32_t getHash(const std::string& options)
548 uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549 return hash & (MAX_ENTRIES - 1);
552 inline size_t getFileSize()
554 size_t pos = (size_t)f.tellg();
555 f.seekg(0, std::fstream::end);
556 size_t fileSize = (size_t)f.tellg();
557 f.seekg(pos, std::fstream::beg);
560 inline uint32_t readUInt32()
563 f.read((char*)&res, sizeof(uint32_t));
564 CV_Assert(!f.fail());
567 inline void writeUInt32(const uint32_t value)
570 f.write((char*)&v, sizeof(uint32_t));
571 CV_Assert(!f.fail());
574 inline void seekReadAbsolute(size_t pos)
576 f.seekg(pos, std::fstream::beg);
577 CV_Assert(!f.fail());
579 inline void seekReadRelative(size_t pos)
581 f.seekg(pos, std::fstream::cur);
582 CV_Assert(!f.fail());
585 inline void seekWriteAbsolute(size_t pos)
587 f.seekp(pos, std::fstream::beg);
588 CV_Assert(!f.fail());
594 if (0 != remove(fileName_.c_str()))
595 CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
600 BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601 : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
603 CV_StaticAssert(sizeof(uint32_t) == 4, "");
604 CV_Assert(sourceSignature_ != NULL);
605 CV_Assert(sourceSignatureSize_ > 0);
606 memset(entryOffsets, 0, sizeof(entryOffsets));
608 f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610 if(f.is_open() && getFileSize() > 0)
612 bool isValid = false;
615 uint32_t fileSourceSignatureSize = readUInt32();
616 if (fileSourceSignatureSize == sourceSignatureSize_)
618 cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619 f.read(fileSourceSignature.data(), fileSourceSignatureSize);
622 CV_LOG_ERROR(NULL, "Unexpected EOF");
624 else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
631 CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
634 catch (const cv::Exception& e)
636 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
640 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
653 bool read(const std::string& key, std::vector<char>& buf)
658 size_t fileSize = getFileSize();
661 CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
668 uint32_t fileSourceSignatureSize = readUInt32();
669 CV_Assert(fileSourceSignatureSize > 0);
670 seekReadRelative(fileSourceSignatureSize);
672 uint32_t numberOfEntries = readUInt32();
673 CV_Assert(numberOfEntries > 0);
674 if (numberOfEntries != MAX_ENTRIES)
676 CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
680 f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681 CV_Assert(!f.fail());
683 uint32_t entryNum = getHash(key);
685 uint32_t entryOffset = entryOffsets[entryNum];
687 while (entryOffset > 0)
689 seekReadAbsolute(entryOffset);
690 //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691 f.read((char*)&entry, sizeof(entry));
692 CV_Assert(!f.fail());
693 cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694 if (key.size() == entry.keySize)
696 if (entry.keySize > 0)
698 f.read(fileKey.data(), entry.keySize);
699 CV_Assert(!f.fail());
701 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
703 buf.resize(entry.dataSize);
704 f.read(&buf[0], entry.dataSize);
705 CV_Assert(!f.fail());
707 CV_LOG_VERBOSE(NULL, 0, "Read...");
711 if (entry.nextEntryFileOffset == 0)
713 entryOffset = entry.nextEntryFileOffset;
718 bool write(const std::string& key, std::vector<char>& buf)
722 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
725 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
728 CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
734 size_t fileSize = getFileSize();
738 seekWriteAbsolute(0);
739 writeUInt32((uint32_t)sourceSignatureSize_);
740 f.write(sourceSignature_, sourceSignatureSize_);
741 CV_Assert(!f.fail());
743 writeUInt32(MAX_ENTRIES);
744 memset(entryOffsets, 0, sizeof(entryOffsets));
745 f.write((char*)entryOffsets, sizeof(entryOffsets));
746 CV_Assert(!f.fail());
748 CV_Assert(!f.fail());
750 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751 CV_Assert(f.is_open());
752 fileSize = getFileSize();
757 uint32_t fileSourceSignatureSize = readUInt32();
758 CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759 seekReadRelative(fileSourceSignatureSize);
761 uint32_t numberOfEntries = readUInt32();
762 CV_Assert(numberOfEntries > 0);
763 if (numberOfEntries != MAX_ENTRIES)
765 CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
769 size_t tableEntriesOffset = (size_t)f.tellg();
770 f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771 CV_Assert(!f.fail());
773 uint32_t entryNum = getHash(key);
775 uint32_t entryOffset = entryOffsets[entryNum];
777 while (entryOffset > 0)
779 seekReadAbsolute(entryOffset);
780 //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781 f.read((char*)&entry, sizeof(entry));
782 CV_Assert(!f.fail());
783 cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784 if (key.size() == entry.keySize)
786 if (entry.keySize > 0)
788 f.read(fileKey.data(), entry.keySize);
789 CV_Assert(!f.fail());
791 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
794 CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
798 if (entry.nextEntryFileOffset == 0)
800 entryOffset = entry.nextEntryFileOffset;
805 seekWriteAbsolute(entryOffset);
806 entry.nextEntryFileOffset = (uint32_t)fileSize;
807 f.write((char*)&entry, sizeof(entry));
808 CV_Assert(!f.fail());
812 entryOffsets[entryNum] = (uint32_t)fileSize;
813 seekWriteAbsolute(tableEntriesOffset);
814 f.write((char*)entryOffsets, sizeof(entryOffsets));
815 CV_Assert(!f.fail());
817 seekWriteAbsolute(fileSize);
818 entry.nextEntryFileOffset = 0;
819 entry.dataSize = (uint32_t)buf.size();
820 entry.keySize = (uint32_t)key.size();
821 f.write((char*)&entry, sizeof(entry));
822 CV_Assert(!f.fail());
823 f.write(key.c_str(), entry.keySize);
824 CV_Assert(!f.fail());
825 f.write(&buf[0], entry.dataSize);
826 CV_Assert(!f.fail());
828 CV_Assert(!f.fail());
829 CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
837 struct OpenCLExecutionContext::Impl
839 ocl::Context context_;
840 int device_; // device index in context
847 void _init_device(cl_device_id deviceID)
850 int ndevices = (int)context_.ndevices();
851 CV_Assert(ndevices > 0);
853 for (int i = 0; i < ndevices; i++)
855 ocl::Device d = context_.device(i);
856 cl_device_id dhandle = (cl_device_id)d.ptr();
857 if (dhandle == deviceID)
864 CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
867 void _init_device(const ocl::Device& device)
869 CV_Assert(device.ptr());
870 int ndevices = (int)context_.ndevices();
871 CV_Assert(ndevices > 0);
873 for (int i = 0; i < ndevices; i++)
875 ocl::Device d = context_.device(i);
876 if (d.getImpl() == device.getImpl())
883 CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
887 Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888 : device_(0), useOpenCL_(-1)
890 CV_UNUSED(platformID);
894 context_ = Context::fromHandle(context);
895 _init_device(deviceID);
896 queue_ = Queue(context_, context_.device(device_));
899 Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900 : device_(0), useOpenCL_(-1)
902 CV_Assert(context.ptr());
903 CV_Assert(device.ptr());
906 _init_device(device);
910 Impl(const ocl::Context& context, const ocl::Device& device)
911 : device_(0), useOpenCL_(-1)
913 CV_Assert(context.ptr());
914 CV_Assert(device.ptr());
917 _init_device(device);
918 queue_ = Queue(context_, context_.device(device_));
921 Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
929 Impl(const Impl& other)
930 : context_(other.context_)
931 , device_(other.device_)
932 , queue_(other.queue_)
938 inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
946 if (!context_.empty() && context_.ndevices() > 0)
948 const Device& d = context_.device(device_);
949 useOpenCL_ = d.available();
952 catch (const cv::Exception&)
957 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
959 return useOpenCL_ > 0;
962 void setUseOpenCL(bool flag)
970 static const std::shared_ptr<Impl>& getInitializedExecutionContext()
974 CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
976 static bool initialized = false;
977 static std::shared_ptr<Impl> g_primaryExecutionContext;
981 cv::AutoLock lock(getInitializationMutex());
984 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
987 Context c = ocl::Context::create(std::string());
991 auto& d = c.device(deviceId);
994 auto q = ocl::Queue(c, d);
997 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
1001 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1007 CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1012 CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1015 catch (const std::exception& e)
1017 CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1021 CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1026 return g_primaryExecutionContext;
1030 Context& OpenCLExecutionContext::getContext() const
1035 Device& OpenCLExecutionContext::getDevice() const
1038 return p->context_.device(p->device_);
1040 Queue& OpenCLExecutionContext::getQueue() const
1046 bool OpenCLExecutionContext::useOpenCL() const
1049 return p->useOpenCL();
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1055 p->setUseOpenCL(flag);
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1061 CV_TRACE_FUNCTION();
1062 CoreTLSData& data = getCoreTlsData();
1063 OpenCLExecutionContext& c = data.oclExecutionContext;
1064 if (!data.oclExecutionContextInitialized)
1066 data.oclExecutionContextInitialized = true;
1067 if (c.empty() && haveOpenCL())
1068 c.p = Impl::getInitializedExecutionContext();
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1076 CV_TRACE_FUNCTION();
1077 CoreTLSData& data = getCoreTlsData();
1078 OpenCLExecutionContext& c = data.oclExecutionContext;
1082 void OpenCLExecutionContext::bind() const
1084 CV_TRACE_FUNCTION();
1086 CoreTLSData& data = getCoreTlsData();
1087 data.oclExecutionContext = *this;
1088 data.oclExecutionContextInitialized = true;
1089 data.useOpenCL = p->useOpenCL_; // propagate "-1", avoid call useOpenCL()
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1095 CV_TRACE_FUNCTION();
1097 const Queue q(getContext(), getDevice());
1098 return cloneWithNewQueue(q);
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1103 CV_TRACE_FUNCTION();
1105 CV_Assert(q.ptr() != NULL);
1106 OpenCLExecutionContext c;
1107 c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1114 CV_TRACE_FUNCTION();
1116 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1118 CV_Assert(!context.empty());
1119 CV_Assert(context.ptr());
1120 CV_Assert(!device.empty());
1121 CV_Assert(device.ptr());
1122 OpenCLExecutionContext ctx;
1123 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1131 CV_TRACE_FUNCTION();
1133 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1135 CV_Assert(!context.empty());
1136 CV_Assert(context.ptr());
1137 CV_Assert(!device.empty());
1138 CV_Assert(device.ptr());
1139 OpenCLExecutionContext ctx;
1140 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1145 void OpenCLExecutionContext::release()
1147 CV_TRACE_FUNCTION();
1153 // true if we have initialized OpenCL subsystem with available platforms
1154 static bool g_isOpenCLInitialized = false;
1155 static bool g_isOpenCLAvailable = false;
1159 CV_TRACE_FUNCTION();
1161 if (!g_isOpenCLInitialized)
1163 CV_TRACE_REGION("Init_OpenCL_Runtime");
1164 const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1167 if (cv::String(envPath) == "disabled")
1169 g_isOpenCLAvailable = false;
1170 g_isOpenCLInitialized = true;
1175 cv::AutoLock lock(getInitializationMutex());
1176 CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1180 g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181 g_isOpenCLAvailable &= n > 0;
1182 CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1186 g_isOpenCLAvailable = false;
1188 g_isOpenCLInitialized = true;
1190 return g_isOpenCLAvailable;
1195 CoreTLSData& data = getCoreTlsData();
1196 if (data.useOpenCL < 0)
1203 auto c = OpenCLExecutionContext::getCurrent();
1204 data.useOpenCL = c.useOpenCL();
1209 CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1212 return data.useOpenCL > 0;
1215 bool isOpenCLActivated()
1217 if (!g_isOpenCLAvailable)
1218 return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1222 void setUseOpenCL(bool flag)
1224 CV_TRACE_FUNCTION();
1226 CoreTLSData& data = getCoreTlsData();
1227 auto& c = OpenCLExecutionContext::getCurrentRef();
1230 c.setUseOpenCL(flag);
1231 data.useOpenCL = c.useOpenCL();
1238 data.useOpenCL = -1; // enabled by default (if context is not initialized)
1244 #ifdef HAVE_CLAMDBLAS
1249 static AmdBlasHelper & getInstance()
1251 CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1254 bool isAvailable() const
1256 return g_isAmdBlasAvailable;
1263 clAmdBlasTeardown();
1271 if (!g_isAmdBlasInitialized)
1273 AutoLock lock(getInitializationMutex());
1275 if (!g_isAmdBlasInitialized)
1281 g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1285 g_isAmdBlasAvailable = false;
1289 g_isAmdBlasAvailable = false;
1291 g_isAmdBlasInitialized = true;
1297 static bool g_isAmdBlasInitialized;
1298 static bool g_isAmdBlasAvailable;
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1306 return AmdBlasHelper::getInstance().isAvailable();
1318 #ifdef HAVE_CLAMDFFT
1323 static AmdFftHelper & getInstance()
1325 CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1328 bool isAvailable() const
1330 return g_isAmdFftAvailable;
1337 // clAmdFftTeardown();
1345 if (!g_isAmdFftInitialized)
1347 AutoLock lock(getInitializationMutex());
1349 if (!g_isAmdFftInitialized)
1355 cl_uint major, minor, patch;
1356 CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1358 // it throws exception in case AmdFft binaries are not found
1359 CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360 g_isAmdFftAvailable = true;
1362 catch (const Exception &)
1364 g_isAmdFftAvailable = false;
1368 g_isAmdFftAvailable = false;
1370 g_isAmdFftInitialized = true;
1376 static clAmdFftSetupData setupData;
1377 static bool g_isAmdFftInitialized;
1378 static bool g_isAmdFftAvailable;
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1387 return AmdFftHelper::getInstance().isAvailable();
1401 #ifdef HAVE_OPENCL_SVM
1410 Queue::getDefault().finish();
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1415 struct Platform::Impl
1421 initialized = false;
1430 //cl_uint num_entries
1432 if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1438 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1440 vendor = String(buf);
1447 IMPLEMENT_REFCOUNTABLE();
1449 cl_platform_id handle;
1454 Platform::Platform()
1459 Platform::~Platform()
1465 Platform::Platform(const Platform& pl)
1472 Platform& Platform::operator = (const Platform& pl)
1474 Impl* newp = (Impl*)pl.p;
1483 void* Platform::ptr() const
1485 return p ? p->handle : 0;
1488 Platform& Platform::getDefault()
1490 CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1502 // Version has format:
1503 // OpenCL<space><major_version.minor_version><space><vendor-specific information>
1505 // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 // https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html
1508 // https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html
1509 static void parseOpenCLVersion(const String &version, int &major, int &minor)
1512 if (10 >= version.length())
1514 const char *pstr = version.c_str();
1515 if (0 != strncmp(pstr, "OpenCL ", 7))
1517 size_t ppos = version.find('.', 7);
1518 if (String::npos == ppos)
1520 String temp = version.substr(7, ppos - 7);
1521 major = atoi(temp.c_str());
1522 temp = version.substr(ppos + 1);
1523 minor = atoi(temp.c_str());
1534 cl_device_id device = (cl_device_id)d;
1536 CV_OCL_CHECK(clRetainDevice(device)); // increment reference counter on success only
1544 void _init(cl_device_id d)
1546 handle = (cl_device_id)d;
1548 name_ = getStrProp(CL_DEVICE_NAME);
1549 version_ = getStrProp(CL_DEVICE_VERSION);
1550 extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1551 doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1552 hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1553 maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1554 maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1555 type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1556 driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1557 addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1559 String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1560 parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1563 while (pos < extensions_.size())
1565 size_t pos2 = extensions_.find(' ', pos);
1566 if (pos2 == String::npos)
1567 pos2 = extensions_.size();
1570 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1571 extensions_set_.insert(extensionName);
1576 intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1578 vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1579 if (vendorName_ == "Advanced Micro Devices, Inc." ||
1580 vendorName_ == "AMD")
1581 vendorID_ = VENDOR_AMD;
1582 else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1583 vendorID_ = VENDOR_INTEL;
1584 else if (vendorName_ == "NVIDIA Corporation")
1585 vendorID_ = VENDOR_NVIDIA;
1587 vendorID_ = UNKNOWN_VENDOR;
1589 const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1590 if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1592 const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1593 if (new_maxWorkGroupSize != maxWorkGroupSize_)
1594 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1595 maxWorkGroupSize_ = new_maxWorkGroupSize;
1598 if (isExtensionSupported("cl_khr_spir"))
1600 #ifndef CL_DEVICE_SPIR_VERSIONS
1601 #define CL_DEVICE_SPIR_VERSIONS 0x40E0
1603 cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1604 std::cout << spir_versions << std::endl;
1612 if (!cv::__termination)
1617 CV_OCL_CHECK(clReleaseDevice(handle));
1623 template<typename _TpCL, typename _TpOut>
1624 _TpOut getProp(cl_device_info prop) const
1629 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1630 sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1633 bool getBoolProp(cl_device_info prop) const
1635 cl_bool temp = CL_FALSE;
1638 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1639 sz == sizeof(temp) ? temp != 0 : false;
1642 String getStrProp(cl_device_info prop) const
1646 return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1647 sz < sizeof(buf) ? String(buf) : String();
1650 bool isExtensionSupported(const std::string& extensionName) const
1652 return extensions_set_.count(extensionName) > 0;
1656 IMPLEMENT_REFCOUNTABLE();
1658 cl_device_id handle;
1662 std::string extensions_;
1663 int doubleFPConfig_;
1664 bool hostUnifiedMemory_;
1665 int maxComputeUnits_;
1666 size_t maxWorkGroupSize_;
1669 int deviceVersionMajor_;
1670 int deviceVersionMinor_;
1671 String driverVersion_;
1674 bool intelSubgroupsSupport_;
1676 std::set<std::string> extensions_set_;
1685 Device::Device(void* d)
1691 Device::Device(const Device& d)
1698 Device& Device::operator = (const Device& d)
1700 Impl* newp = (Impl*)d.p;
1715 void Device::set(void* d)
1722 CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1726 Device Device::fromHandle(void* d)
1732 void* Device::ptr() const
1734 return p ? p->handle : 0;
1737 String Device::name() const
1738 { return p ? p->name_ : String(); }
1740 String Device::extensions() const
1741 { return p ? String(p->extensions_) : String(); }
1743 bool Device::isExtensionSupported(const String& extensionName) const
1744 { return p ? p->isExtensionSupported(extensionName) : false; }
1746 String Device::version() const
1747 { return p ? p->version_ : String(); }
1749 String Device::vendorName() const
1750 { return p ? p->vendorName_ : String(); }
1752 int Device::vendorID() const
1753 { return p ? p->vendorID_ : 0; }
1755 String Device::OpenCL_C_Version() const
1756 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1758 String Device::OpenCLVersion() const
1759 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1761 int Device::deviceVersionMajor() const
1762 { return p ? p->deviceVersionMajor_ : 0; }
1764 int Device::deviceVersionMinor() const
1765 { return p ? p->deviceVersionMinor_ : 0; }
1767 String Device::driverVersion() const
1768 { return p ? p->driverVersion_ : String(); }
1770 int Device::type() const
1771 { return p ? p->type_ : 0; }
1773 int Device::addressBits() const
1774 { return p ? p->addressBits_ : 0; }
1776 bool Device::available() const
1777 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1779 bool Device::compilerAvailable() const
1780 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1782 bool Device::linkerAvailable() const
1783 #ifdef CL_VERSION_1_2
1784 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1786 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1789 int Device::doubleFPConfig() const
1790 { return p ? p->doubleFPConfig_ : 0; }
1792 int Device::singleFPConfig() const
1793 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1795 int Device::halfFPConfig() const
1796 #ifdef CL_VERSION_1_2
1797 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1799 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1802 bool Device::endianLittle() const
1803 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1805 bool Device::errorCorrectionSupport() const
1806 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1808 int Device::executionCapabilities() const
1809 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1811 size_t Device::globalMemCacheSize() const
1812 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1814 int Device::globalMemCacheType() const
1815 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1817 int Device::globalMemCacheLineSize() const
1818 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1820 size_t Device::globalMemSize() const
1821 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1823 size_t Device::localMemSize() const
1824 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1826 int Device::localMemType() const
1827 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1829 bool Device::hostUnifiedMemory() const
1830 { return p ? p->hostUnifiedMemory_ : false; }
1832 bool Device::imageSupport() const
1833 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1835 bool Device::imageFromBufferSupport() const
1837 return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1840 uint Device::imagePitchAlignment() const
1842 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1843 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1849 uint Device::imageBaseAddressAlignment() const
1851 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1852 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1858 size_t Device::image2DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1861 size_t Device::image2DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1864 size_t Device::image3DMaxWidth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1867 size_t Device::image3DMaxHeight() const
1868 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1870 size_t Device::image3DMaxDepth() const
1871 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1873 size_t Device::imageMaxBufferSize() const
1874 #ifdef CL_VERSION_1_2
1875 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1877 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1880 size_t Device::imageMaxArraySize() const
1881 #ifdef CL_VERSION_1_2
1882 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1884 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1887 bool Device::intelSubgroupsSupport() const
1888 { return p ? p->intelSubgroupsSupport_ : false; }
1890 int Device::maxClockFrequency() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1893 int Device::maxComputeUnits() const
1894 { return p ? p->maxComputeUnits_ : 0; }
1896 int Device::maxConstantArgs() const
1897 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1899 size_t Device::maxConstantBufferSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1902 size_t Device::maxMemAllocSize() const
1903 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1905 size_t Device::maxParameterSize() const
1906 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1908 int Device::maxReadImageArgs() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1911 int Device::maxWriteImageArgs() const
1912 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1914 int Device::maxSamplers() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1917 size_t Device::maxWorkGroupSize() const
1918 { return p ? p->maxWorkGroupSize_ : 0; }
1920 int Device::maxWorkItemDims() const
1921 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1923 void Device::maxWorkItemSizes(size_t* sizes) const
1927 const int MAX_DIMS = 32;
1929 CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1930 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1934 int Device::memBaseAddrAlign() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1937 int Device::nativeVectorWidthChar() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1940 int Device::nativeVectorWidthShort() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1943 int Device::nativeVectorWidthInt() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1946 int Device::nativeVectorWidthLong() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1949 int Device::nativeVectorWidthFloat() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1952 int Device::nativeVectorWidthDouble() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1955 int Device::nativeVectorWidthHalf() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1958 int Device::preferredVectorWidthChar() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1961 int Device::preferredVectorWidthShort() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1964 int Device::preferredVectorWidthInt() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1967 int Device::preferredVectorWidthLong() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1970 int Device::preferredVectorWidthFloat() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1973 int Device::preferredVectorWidthDouble() const
1974 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1976 int Device::preferredVectorWidthHalf() const
1977 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1979 size_t Device::printfBufferSize() const
1980 #ifdef CL_VERSION_1_2
1981 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1983 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1987 size_t Device::profilingTimerResolution() const
1988 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1990 const Device& Device::getDefault()
1992 auto& c = OpenCLExecutionContext::getCurrent();
1995 return c.getDevice();
1998 static Device dummy;
2002 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2004 template <typename Functor, typename ObjectType>
2005 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2008 cl_int err = f(obj, name, 0, NULL, &required);
2009 if (err != CL_SUCCESS)
2015 AutoBuffer<char> buf(required + 1);
2016 char* ptr = buf.data(); // cleanup is not needed
2017 err = f(obj, name, required, ptr, NULL);
2018 if (err != CL_SUCCESS)
2026 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2031 std::istringstream ss(s);
2035 std::getline(ss, item, delim);
2036 elems.push_back(item);
2040 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2042 // Sample: AMD:GPU:Tahiti
2043 // Sample: :GPU|CPU: = '' = ':' = '::'
2044 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2045 std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2047 std::vector<std::string> parts;
2048 split(configurationStr, ':', parts);
2049 if (parts.size() > 3)
2051 CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2054 if (parts.size() > 2)
2055 deviceNameOrID = parts[2];
2056 if (parts.size() > 1)
2058 split(parts[1], '|', deviceTypes);
2060 if (parts.size() > 0)
2062 platform = parts[0];
2067 #if defined WINRT || defined _WIN32_WCE
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2070 CV_UNUSED(configuration)
2074 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2076 std::string platform, deviceName;
2077 std::vector<std::string> deviceTypes;
2080 configuration = getenv("OPENCV_OPENCL_DEVICE");
2082 if (configuration &&
2083 (strcmp(configuration, "disabled") == 0 ||
2084 !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2090 if (deviceName.length() == 1)
2091 // We limit ID range to 0..9, because we want to write:
2092 // - '2500' to mean i5-2500
2093 // - '8350' to mean AMD FX-8350
2094 // - '650' to mean GeForce 650
2095 // To extend ID range change condition to '> 0'
2098 for (size_t i = 0; i < deviceName.length(); i++)
2100 if (!isdigit(deviceName[i]))
2108 deviceID = atoi(deviceName.c_str());
2114 std::vector<cl_platform_id> platforms;
2116 cl_uint numPlatforms = 0;
2117 CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2119 if (numPlatforms == 0)
2121 platforms.resize((size_t)numPlatforms);
2122 CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2123 platforms.resize(numPlatforms);
2126 int selectedPlatform = -1;
2127 if (platform.length() > 0)
2129 for (size_t i = 0; i < platforms.size(); i++)
2132 CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2133 if (name.find(platform) != std::string::npos)
2135 selectedPlatform = (int)i;
2139 if (selectedPlatform == -1)
2141 CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2145 if (deviceTypes.size() == 0)
2149 deviceTypes.push_back("GPU");
2151 deviceTypes.push_back("CPU");
2154 deviceTypes.push_back("ALL");
2156 for (size_t t = 0; t < deviceTypes.size(); t++)
2159 std::string tempStrDeviceType = deviceTypes[t];
2160 std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2162 if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2163 deviceType = Device::TYPE_GPU;
2164 else if (tempStrDeviceType == "cpu")
2165 deviceType = Device::TYPE_CPU;
2166 else if (tempStrDeviceType == "accelerator")
2167 deviceType = Device::TYPE_ACCELERATOR;
2168 else if (tempStrDeviceType == "all")
2169 deviceType = Device::TYPE_ALL;
2172 CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2176 std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2177 for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2178 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2182 cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2183 if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2185 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2189 size_t base = devices.size();
2190 devices.resize(base + count);
2191 status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2192 if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2194 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2198 for (size_t i = (isID ? deviceID : 0);
2199 (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2203 CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2204 cl_bool useGPU = true;
2205 if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2207 cl_bool isIGPU = CL_FALSE;
2208 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2209 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2211 if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2213 // TODO check for OpenCL 1.1
2221 return NULL; // suppress messages on stderr
2223 std::ostringstream msg;
2224 msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2225 << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2226 << " Device types:";
2227 for (size_t t = 0; t < deviceTypes.size(); t++)
2228 msg << ' ' << deviceTypes[t];
2230 msg << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2232 CV_LOG_ERROR(NULL, msg.str());
2237 #ifdef HAVE_OPENCL_SVM
2240 enum AllocatorFlags { // don't use first 16 bits
2241 OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2242 OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2243 OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2244 OPENCL_SVM_BUFFER_MASK = 3 << 16,
2245 OPENCL_SVM_BUFFER_MAP = 4 << 16
2248 static bool checkForceSVMUmatUsage()
2250 static bool initialized = false;
2251 static bool force = false;
2254 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2259 static bool checkDisableSVMUMatUsage()
2261 static bool initialized = false;
2262 static bool force = false;
2265 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2270 static bool checkDisableSVM()
2272 static bool initialized = false;
2273 static bool force = false;
2276 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2281 // see SVMCapabilities
2282 static unsigned int getSVMCapabilitiesMask()
2284 static bool initialized = false;
2285 static unsigned int mask = 0;
2288 const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2289 if (envValue == NULL)
2291 return ~0U; // all bits 1
2293 mask = atoi(envValue);
2301 static size_t getProgramCountLimit()
2303 static bool initialized = false;
2304 static size_t count = 0;
2307 count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2313 static int g_contextId = 0;
2315 class OpenCLBufferPoolImpl;
2316 class OpenCLSVMBufferPoolImpl;
2318 struct Context::Impl
2320 static Context::Impl* get(Context& context) { return context.p; }
2322 typedef std::deque<Context::Impl*> container_t;
2323 static container_t& getGlobalContainer()
2325 // never delete this container (Impl lifetime is greater due to TLS storage)
2326 static container_t* g_contexts = new container_t();
2331 Impl(const std::string& configuration_)
2333 , contextId(CV_XADD(&g_contextId, 1))
2334 , configuration(configuration_)
2339 #ifdef HAVE_OPENCL_SVM
2340 , svmInitialized(false)
2344 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2346 cv::AutoLock lock(cv::getInitializationMutex());
2347 auto& container = getGlobalContainer();
2348 container.resize(std::max(container.size(), (size_t)contextId + 1));
2349 container[contextId] = this;
2355 if (!cv::__termination)
2360 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2365 directx::internal::deleteDirectXImpl(&p_directx_impl);
2370 cv::AutoLock lock(cv::getInitializationMutex());
2371 auto& container = getGlobalContainer();
2372 CV_CheckLT((size_t)contextId, container.size(), "");
2373 container[contextId] = NULL;
2377 void init_device_list()
2381 cl_uint ndevices = 0;
2382 CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2383 CV_Assert(ndevices > 0);
2385 cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2386 size_t devices_ret_size = 0;
2387 CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2388 CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2391 for (unsigned i = 0; i < ndevices; i++)
2393 devices.emplace_back(Device::fromHandle(cl_devices[i]));
2397 void __init_buffer_pools(); // w/o synchronization
2398 void _init_buffer_pools() const
2402 cv::AutoLock lock(cv::getInitializationMutex());
2405 const_cast<Impl*>(this)->__init_buffer_pools();
2410 static Impl* findContext(const std::string& configuration)
2412 CV_TRACE_FUNCTION();
2413 cv::AutoLock lock(cv::getInitializationMutex());
2414 auto& container = getGlobalContainer();
2415 if (configuration.empty() && !container.empty())
2416 return container[0];
2417 for (auto it = container.begin(); it != container.end(); ++it)
2420 if (i && i->configuration == configuration)
2428 static Impl* findOrCreateContext(const std::string& configuration_)
2430 CV_TRACE_FUNCTION();
2431 std::string configuration = configuration_;
2432 if (configuration_.empty())
2434 const char* c = getenv("OPENCV_OPENCL_DEVICE");
2438 Impl* impl = findContext(configuration);
2441 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2446 cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2450 impl = new Impl(configuration);
2453 impl->createFromDevice(d);
2466 static Impl* findOrCreateContext(cl_context h)
2468 CV_TRACE_FUNCTION();
2472 std::string configuration = cv::format("@ctx-%p", (void*)h);
2473 Impl* impl = findContext(configuration);
2476 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2481 impl = new Impl(configuration);
2484 CV_OCL_CHECK(clRetainContext(h));
2486 impl->init_device_list();
2496 static Impl* findOrCreateContext(const ocl::Device& device)
2498 CV_TRACE_FUNCTION();
2500 CV_Assert(!device.empty());
2501 cl_device_id d = (cl_device_id)device.ptr();
2504 std::string configuration = cv::format("@dev-%p", (void*)d);
2505 Impl* impl = findContext(configuration);
2508 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2513 impl = new Impl(configuration);
2516 impl->createFromDevice(d);
2517 CV_Assert(impl->handle);
2529 CV_TRACE_FUNCTION();
2530 cl_device_id d = selectOpenCLDevice();
2535 createFromDevice(d);
2538 void createFromDevice(cl_device_id d)
2540 CV_TRACE_FUNCTION();
2541 CV_Assert(handle == NULL);
2543 cl_platform_id pl = NULL;
2544 CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2546 cl_context_properties prop[] =
2548 CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2552 // !!! in the current implementation force the number of devices to 1 !!!
2556 handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2557 CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2559 bool ok = handle != 0 && status == CL_SUCCESS;
2569 Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2571 void unloadProg(Program& prog)
2573 cv::AutoLock lock(program_cache_mutex);
2574 for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2576 phash_t::iterator it = phash.find(*i);
2577 if (it != phash.end())
2579 if (it->second.ptr() == prog.ptr())
2589 std::string& getPrefixString()
2593 cv::AutoLock lock(program_cache_mutex);
2596 CV_Assert(!devices.empty());
2597 const Device& d = devices[0];
2598 int bits = d.addressBits();
2599 if (bits > 0 && bits != 64)
2600 prefix = cv::format("%d-bit--", bits);
2601 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2603 for (size_t i = 0; i < prefix.size(); i++)
2606 if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2616 std::string& getPrefixBase()
2618 if (prefix_base.empty())
2620 cv::AutoLock lock(program_cache_mutex);
2621 if (prefix_base.empty())
2623 const Device& d = devices[0];
2624 int bits = d.addressBits();
2625 if (bits > 0 && bits != 64)
2626 prefix_base = cv::format("%d-bit--", bits);
2627 prefix_base += d.vendorName() + "--" + d.name() + "--";
2629 for (size_t i = 0; i < prefix_base.size(); i++)
2631 char c = prefix_base[i];
2632 if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2634 prefix_base[i] = '_';
2642 IMPLEMENT_REFCOUNTABLE();
2644 const int contextId; // global unique ID
2645 const std::string configuration;
2648 std::vector<Device> devices;
2651 std::string prefix_base;
2653 cv::Mutex program_cache_mutex;
2654 typedef std::map<std::string, Program> phash_t;
2656 typedef std::list<cv::String> CacheList;
2657 CacheList cacheList;
2659 std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2660 std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2661 OpenCLBufferPoolImpl& getBufferPool() const
2663 _init_buffer_pools();
2664 CV_DbgAssert(bufferPool_);
2665 return *bufferPool_.get();
2667 OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2669 _init_buffer_pools();
2670 CV_DbgAssert(bufferPoolHostPtr_);
2671 return *bufferPoolHostPtr_.get();
2675 directx::internal::OpenCLDirectXImpl* p_directx_impl;
2677 directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2679 if (!p_directx_impl)
2681 p_directx_impl = directx::internal::createDirectXImpl();
2683 return p_directx_impl;
2687 #ifdef HAVE_OPENCL_SVM
2688 bool svmInitialized;
2691 svm::SVMCapabilities svmCapabilities;
2692 svm::SVMFunctions svmFunctions;
2696 CV_Assert(handle != NULL);
2697 const Device& device = devices[0];
2698 cl_device_svm_capabilities deviceCaps = 0;
2699 CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2700 cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2701 if (status != CL_SUCCESS)
2703 CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2706 CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2707 CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2708 svmCapabilities.value_ =
2709 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2710 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2711 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2712 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2713 svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2714 if (svmCapabilities.value_ == 0)
2716 CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2722 CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2723 void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2726 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2727 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2732 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2733 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2735 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2736 CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2741 ((int*)ptr)[0] = 100;
2745 CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2748 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2750 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2751 CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2756 CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2761 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2762 clSVMFree(handle, ptr);
2765 clSVMFree(handle, ptr);
2766 svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2767 svmFunctions.fn_clSVMFree = clSVMFree;
2768 svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2769 //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2770 //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2771 svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2772 svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2773 svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2774 svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2778 CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2781 // Try HSA extension
2782 String extensions = device.extensions();
2783 if (extensions.find("cl_amd_svm") == String::npos)
2785 CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2788 cl_platform_id p = NULL;
2789 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2790 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2791 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2792 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2793 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2794 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2795 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2796 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2797 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2798 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2799 CV_Assert(svmFunctions.isValid());
2803 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2808 svmAvailable = true;
2809 svmEnabled = !svm::checkDisableSVM();
2810 svmInitialized = true;
2811 CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2814 CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2815 svmAvailable = false;
2817 svmCapabilities.value_ = 0;
2818 svmInitialized = true;
2819 svmFunctions.fn_clSVMAlloc = NULL;
2823 std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2825 OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2827 _init_buffer_pools();
2828 CV_DbgAssert(bufferPoolSVM_);
2829 return *bufferPoolSVM_.get();
2833 friend class Program;
2848 Context::Context(int dtype)
2854 void Context::release()
2863 bool Context::create()
2868 p = Impl::findOrCreateContext(std::string());
2876 bool Context::create(int dtype)
2881 if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2883 p = Impl::findOrCreateContext("");
2885 else if (dtype == CL_DEVICE_TYPE_GPU)
2887 p = Impl::findOrCreateContext(":GPU:");
2889 else if (dtype == CL_DEVICE_TYPE_CPU)
2891 p = Impl::findOrCreateContext(":CPU:");
2895 CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2897 if (p && !p->handle)
2904 Context::Context(const Context& c)
2911 Context& Context::operator = (const Context& c)
2913 Impl* newp = (Impl*)c.p;
2922 void* Context::ptr() const
2924 return p == NULL ? NULL : p->handle;
2927 size_t Context::ndevices() const
2929 return p ? p->devices.size() : 0;
2932 Device& Context::device(size_t idx) const
2934 static Device dummy;
2935 return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2938 Context& Context::getDefault(bool initialize)
2940 auto& c = OpenCLExecutionContext::getCurrent();
2943 auto& ctx = c.getContext();
2947 CV_UNUSED(initialize);
2948 static Context dummy;
2952 Program Context::getProg(const ProgramSource& prog,
2953 const String& buildopts, String& errmsg)
2955 return p ? p->getProg(prog, buildopts, errmsg) : Program();
2958 void Context::unloadProg(Program& prog)
2961 p->unloadProg(prog);
2965 Context Context::fromHandle(void* context)
2968 ctx.p = Impl::findOrCreateContext((cl_context)context);
2973 Context Context::fromDevice(const ocl::Device& device)
2976 ctx.p = Impl::findOrCreateContext(device);
2981 Context Context::create(const std::string& configuration)
2984 ctx.p = Impl::findOrCreateContext(configuration);
2988 #ifdef HAVE_OPENCL_SVM
2989 bool Context::useSVM() const
2991 Context::Impl* i = p;
2993 if (!i->svmInitialized)
2995 return i->svmEnabled;
2997 void Context::setUseSVM(bool enabled)
2999 Context::Impl* i = p;
3001 if (!i->svmInitialized)
3003 if (enabled && !i->svmAvailable)
3005 CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3007 i->svmEnabled = enabled;
3010 bool Context::useSVM() const { return false; }
3011 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3014 #ifdef HAVE_OPENCL_SVM
3017 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3019 Context::Impl* i = context.p;
3021 if (!i->svmInitialized)
3023 return i->svmCapabilities;
3026 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3028 Context::Impl* i = context.p;
3030 CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3031 CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3032 return &i->svmFunctions;
3035 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3037 if (checkForceSVMUmatUsage())
3039 if (checkDisableSVMUMatUsage())
3041 if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3043 return false; // don't use SVM by default
3046 } // namespace cv::ocl::svm
3047 #endif // HAVE_OPENCL_SVM
3050 static void get_platform_name(cl_platform_id id, String& name)
3052 // get platform name string length
3054 CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3056 // get platform name string
3057 AutoBuffer<char> buf(sz + 1);
3058 CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3060 // just in case, ensure trailing zero for ASCIIZ string
3067 // Attaches OpenCL context to OpenCV
3069 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3071 auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3076 OpenCLExecutionContext OpenCLExecutionContext::create(
3077 const std::string& platformName, void* platformID, void* context, void* deviceID
3081 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3084 CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3087 CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3089 std::vector<cl_platform_id> platforms(cnt);
3091 CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3093 bool platformAvailable = false;
3095 // check if external platformName contained in list of available platforms in OpenCV
3096 for (unsigned int i = 0; i < cnt; i++)
3098 String availablePlatformName;
3099 get_platform_name(platforms[i], availablePlatformName);
3100 // external platform is found in the list of available platforms
3101 if (platformName == availablePlatformName)
3103 platformAvailable = true;
3108 if (!platformAvailable)
3109 CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3111 // check if platformID corresponds to platformName
3112 String actualPlatformName;
3113 get_platform_name((cl_platform_id)platformID, actualPlatformName);
3114 if (platformName != actualPlatformName)
3115 CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3117 OpenCLExecutionContext ctx;
3118 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3119 CV_OCL_CHECK(clReleaseContext((cl_context)context));
3120 CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3124 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3126 // internal call, less checks
3127 cl_platform_id platformID = (cl_platform_id)_platform;
3128 cl_context context = (cl_context)_context;
3129 cl_device_id deviceID = (cl_device_id)_device;
3131 std::string platformName = PlatformInfo(&platformID).name();
3133 auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3134 CV_Assert(!clExecCtx.empty());
3135 ctx = clExecCtx.getContext();
3138 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3142 inline void __init()
3146 isProfilingQueue_ = false;
3149 Impl(cl_command_queue q)
3154 cl_command_queue_properties props = 0;
3155 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3156 isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3159 Impl(cl_command_queue q, bool isProfilingQueue)
3163 isProfilingQueue_ = isProfilingQueue;
3166 Impl(const Context& c, const Device& d, bool withProfiling = false)
3170 const Context* pc = &c;
3171 cl_context ch = (cl_context)pc->ptr();
3174 pc = &Context::getDefault();
3175 ch = (cl_context)pc->ptr();
3177 cl_device_id dh = (cl_device_id)d.ptr();
3179 dh = (cl_device_id)pc->device(0).ptr();
3181 cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3182 CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3183 isProfilingQueue_ = withProfiling;
3189 if (!cv::__termination)
3194 CV_OCL_DBG_CHECK(clFinish(handle));
3195 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3201 const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3203 if (isProfilingQueue_)
3206 if (profiling_queue_.ptr())
3207 return profiling_queue_;
3210 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3212 cl_device_id device = 0;
3213 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3215 cl_int result = CL_SUCCESS;
3216 cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3217 cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3218 CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3221 queue.p = new Impl(q, true);
3222 profiling_queue_ = queue;
3224 return profiling_queue_;
3227 IMPLEMENT_REFCOUNTABLE();
3229 cl_command_queue handle;
3230 bool isProfilingQueue_;
3231 cv::ocl::Queue profiling_queue_;
3239 Queue::Queue(const Context& c, const Device& d)
3245 Queue::Queue(const Queue& q)
3252 Queue& Queue::operator = (const Queue& q)
3254 Impl* newp = (Impl*)q.p;
3269 bool Queue::create(const Context& c, const Device& d)
3274 return p->handle != 0;
3277 void Queue::finish()
3281 CV_OCL_DBG_CHECK(clFinish(p->handle));
3285 const Queue& Queue::getProfilingQueue() const
3288 return p->getProfilingQueue(*this);
3291 void* Queue::ptr() const
3293 return p ? p->handle : 0;
3296 Queue& Queue::getDefault()
3298 auto& c = OpenCLExecutionContext::getCurrent();
3301 auto& q = c.getQueue();
3308 static cl_command_queue getQueue(const Queue& q)
3310 cl_command_queue qq = (cl_command_queue)q.ptr();
3312 qq = (cl_command_queue)Queue::getDefault().ptr();
3316 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3318 KernelArg::KernelArg()
3319 : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3323 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3324 : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3326 CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3329 KernelArg KernelArg::Constant(const Mat& m)
3331 CV_Assert(m.isContinuous());
3332 return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3335 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3339 Impl(const char* kname, const Program& prog) :
3340 refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3342 cl_program ph = (cl_program)prog.ptr();
3347 handle = clCreateKernel(ph, kname, &retval);
3348 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3350 for( int i = 0; i < MAX_ARRS; i++ )
3352 haveTempDstUMats = false;
3353 haveTempSrcUMats = false;
3358 for( int i = 0; i < MAX_ARRS; i++ )
3361 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3363 u[i]->flags |= UMatData::ASYNC_CLEANUP;
3364 u[i]->currAllocator->deallocate(u[i]);
3369 haveTempDstUMats = false;
3370 haveTempSrcUMats = false;
3373 void addUMat(const UMat& m, bool dst)
3375 CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3377 CV_XADD(&m.u->urefcount, 1);
3379 if(dst && m.u->tempUMat())
3380 haveTempDstUMats = true;
3381 if(m.u->originalUMatData == NULL && m.u->tempUMat())
3382 haveTempSrcUMats = true; // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3385 /// Preserve image lifetime (while it is specified as Kernel argument)
3386 void registerImageArgument(int arg, const Image2D& image)
3388 CV_CheckGE(arg, 0, "");
3389 CV_CheckLT(arg, (int)MAX_ARRS, "");
3390 if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr()) // TODO future: replace ptr => impl (more strong check)
3392 CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed");
3394 shadow_images.reserve(MAX_ARRS);
3395 shadow_images.resize(std::max(shadow_images.size(), (size_t)arg + 1));
3396 shadow_images[arg] = image;
3399 void finit(cl_event e)
3403 isInProgress = false;
3407 bool run(int dims, size_t _globalsize[], size_t _localsize[],
3408 bool sync, int64* timeNS, const Queue& q);
3414 CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3418 IMPLEMENT_REFCOUNTABLE();
3422 enum { MAX_ARRS = 16 };
3423 UMatData* u[MAX_ARRS];
3425 bool isAsyncRun; // true if kernel was scheduled in async mode
3427 std::vector<Image2D> shadow_images;
3428 bool haveTempDstUMats;
3429 bool haveTempSrcUMats;
3432 }} // namespace cv::ocl
3436 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3440 ((cv::ocl::Kernel::Impl*)p)->finit(e);
3442 catch (const cv::Exception& exc)
3444 CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3446 catch (const std::exception& exc)
3448 CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3452 CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3458 namespace cv { namespace ocl {
3465 Kernel::Kernel(const char* kname, const Program& prog)
3468 create(kname, prog);
3471 Kernel::Kernel(const char* kname, const ProgramSource& src,
3472 const String& buildopts, String* errmsg)
3475 create(kname, src, buildopts, errmsg);
3478 Kernel::Kernel(const Kernel& k)
3485 Kernel& Kernel::operator = (const Kernel& k)
3487 Impl* newp = (Impl*)k.p;
3502 bool Kernel::create(const char* kname, const Program& prog)
3506 p = new Impl(kname, prog);
3512 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3518 bool Kernel::create(const char* kname, const ProgramSource& src,
3519 const String& buildopts, String* errmsg)
3527 if( !errmsg ) errmsg = &tempmsg;
3528 const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3529 return create(kname, prog);
3532 void* Kernel::ptr() const
3534 return p ? p->handle : 0;
3537 bool Kernel::empty() const
3542 static cv::String dumpValue(size_t sz, const void* p)
3545 return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
3547 return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p);
3548 return cv::format("%p", p);
3551 int Kernel::set(int i, const void* value, size_t sz)
3553 if (!p || !p->handle)
3560 cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3561 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str());
3562 if (retval != CL_SUCCESS)
3567 int Kernel::set(int i, const Image2D& image2D)
3569 cl_mem h = (cl_mem)image2D.ptr();
3570 int res = set(i, &h, sizeof(h));
3572 p->registerImageArgument(i, image2D);
3576 int Kernel::set(int i, const UMat& m)
3578 return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3581 int Kernel::set(int i, const KernelArg& arg)
3583 if( !p || !p->handle )
3587 CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3588 p->name.c_str(), (int)i));
3596 AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3597 ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3598 bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3599 if (ptronly && arg.m->empty())
3601 cl_mem h_null = (cl_mem)NULL;
3602 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3603 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3606 cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3610 CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3611 p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3617 #ifdef HAVE_OPENCL_SVM
3618 if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3620 const Context& ctx = Context::getDefault();
3621 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3622 uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3623 CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3625 status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3627 status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3629 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3634 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3635 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3642 else if( arg.m->dims <= 2 )
3645 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3646 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3647 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3648 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3651 if( !(arg.flags & KernelArg::NO_SIZE) )
3653 int cols = u2d.cols*arg.wscale/arg.iwscale;
3654 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3655 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3656 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3657 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3664 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3665 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3666 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3667 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3668 status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3669 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3671 if( !(arg.flags & KernelArg::NO_SIZE) )
3673 int cols = u3d.cols*arg.wscale/arg.iwscale;
3674 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3675 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3676 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3677 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3678 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3679 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3683 p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3686 status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3687 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3691 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3692 bool sync, const Queue& q)
3697 size_t globalsize[CV_MAX_DIM] = {1,1,1};
3699 CV_Assert(_globalsize != NULL);
3700 for (int i = 0; i < dims; i++)
3702 size_t val = _localsize ? _localsize[i] :
3703 dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3704 CV_Assert( val > 0 );
3705 total *= _globalsize[i];
3706 if (_globalsize[i] == 1 && !_localsize)
3708 globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3710 CV_Assert(total > 0);
3712 return p->run(dims, globalsize, _localsize, sync, NULL, q);
3716 static bool isRaiseErrorOnReuseAsyncKernel()
3718 static bool initialized = false;
3719 static bool value = false;
3722 value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3728 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3729 bool sync, int64* timeNS, const Queue& q)
3731 CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3735 CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3741 CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3742 if (isRaiseErrorOnReuseAsyncKernel())
3744 return false; // OpenCV 5.0: raise error
3750 CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3751 if (isRaiseErrorOnReuseAsyncKernel())
3753 return false; // OpenCV 5.0: raise error
3756 cl_command_queue qq = getQueue(q);
3757 if (haveTempDstUMats)
3759 if (haveTempSrcUMats)
3763 cl_event asyncEvent = 0;
3764 cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3765 NULL, globalsize, localsize, 0, 0,
3766 (sync && !timeNS) ? 0 : &asyncEvent);
3767 #if !CV_OPENCL_SHOW_RUN_KERNELS
3768 if (retval != CL_SUCCESS)
3771 cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3772 globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3773 (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3774 sync ? "true" : "false"
3776 if (retval != CL_SUCCESS)
3778 msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3780 #if CV_OPENCL_TRACE_CHECK
3781 CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3783 printf("%s\n", msg.c_str());
3787 if (sync || retval != CL_SUCCESS)
3789 CV_OCL_DBG_CHECK(clFinish(qq));
3792 if (retval == CL_SUCCESS)
3794 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3795 cl_ulong startTime, stopTime;
3796 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3797 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3798 *timeNS = (int64)(stopTime - startTime);
3810 isInProgress = true;
3811 CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3814 CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3815 return retval == CL_SUCCESS;
3818 bool Kernel::runTask(bool sync, const Queue& q)
3820 if(!p || !p->handle || p->isInProgress)
3823 cl_command_queue qq = getQueue(q);
3824 cl_event asyncEvent = 0;
3825 cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3826 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3827 if (sync || retval != CL_SUCCESS)
3829 CV_OCL_DBG_CHECK(clFinish(qq));
3835 p->isInProgress = true;
3836 CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3839 CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3840 return retval == CL_SUCCESS;
3843 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3845 CV_Assert(p && p->handle && !p->isInProgress);
3846 Queue q = q_.ptr() ? q_ : Queue::getDefault();
3848 q.finish(); // call clFinish() on base queue
3849 Queue profilingQueue = q.getProfilingQueue();
3851 bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3852 return res ? timeNs : -1;
3855 size_t Kernel::workGroupSize() const
3857 if(!p || !p->handle)
3859 size_t val = 0, retsz = 0;
3860 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3861 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3862 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3863 return status == CL_SUCCESS ? val : 0;
3866 size_t Kernel::preferedWorkGroupSizeMultiple() const
3868 if(!p || !p->handle)
3870 size_t val = 0, retsz = 0;
3871 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3872 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3873 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3874 return status == CL_SUCCESS ? val : 0;
3877 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3879 if(!p || !p->handle || !wsz)
3882 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3883 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3884 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3885 return status == CL_SUCCESS;
3888 size_t Kernel::localMemSize() const
3890 if(!p || !p->handle)
3894 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3895 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3896 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3897 return status == CL_SUCCESS ? (size_t)val : 0;
3902 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3904 struct ProgramSource::Impl
3906 IMPLEMENT_REFCOUNTABLE();
3909 PROGRAM_SOURCE_CODE = 0,
3915 Impl(const String& src)
3917 init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3918 initFromSource(src, cv::String());
3920 Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3922 init(PROGRAM_SOURCE_CODE, module, name);
3923 initFromSource(codeStr, codeHash);
3927 void init(enum KIND kind, const String& module, const String& name)
3936 isHashUpdated = false;
3939 void initFromSource(const String& codeStr, const String& codeHash)
3942 sourceHash_ = codeHash;
3943 if (sourceHash_.empty())
3949 isHashUpdated = true;
3953 void updateHash(const char* hashStr = NULL)
3957 sourceHash_ = cv::String(hashStr);
3958 isHashUpdated = true;
3964 case PROGRAM_SOURCE_CODE:
3967 CV_Assert(codeStr_.empty());
3968 hash = crc64(sourceAddr_, sourceSize_); // static storage
3972 CV_Assert(!codeStr_.empty());
3973 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3976 case PROGRAM_BINARIES:
3979 hash = crc64(sourceAddr_, sourceSize_);
3982 CV_Error(Error::StsInternal, "Internal error");
3984 sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3985 isHashUpdated = true;
3988 Impl(enum KIND kind,
3989 const String& module, const String& name,
3990 const unsigned char* binary, const size_t size,
3991 const cv::String& buildOptions = cv::String())
3993 init(kind, module, name);
3995 sourceAddr_ = binary;
3998 buildOptions_ = buildOptions;
4001 static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
4002 const char* sourceCodeStaticStr, const char* hashStaticStr,
4003 const cv::String& buildOptions)
4005 ProgramSource result;
4006 result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
4007 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
4008 result.p->updateHash(hashStaticStr);
4012 static ProgramSource fromBinary(const String& module, const String& name,
4013 const unsigned char* binary, const size_t size,
4014 const cv::String& buildOptions)
4016 ProgramSource result;
4017 result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
4021 static ProgramSource fromSPIR(const String& module, const String& name,
4022 const unsigned char* binary, const size_t size,
4023 const cv::String& buildOptions)
4025 ProgramSource result;
4026 result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4033 // TODO std::vector<ProgramSource> includes_;
4034 String codeStr_; // PROGRAM_SOURCE_CODE only
4036 const unsigned char* sourceAddr_;
4039 cv::String buildOptions_;
4044 friend struct Program::Impl;
4045 friend struct internal::ProgramEntry;
4046 friend struct Context::Impl;
4050 ProgramSource::ProgramSource()
4055 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4057 p = new Impl(module, name, codeStr, codeHash);
4060 ProgramSource::ProgramSource(const char* prog)
4065 ProgramSource::ProgramSource(const String& prog)
4070 ProgramSource::~ProgramSource()
4076 ProgramSource::ProgramSource(const ProgramSource& prog)
4083 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4085 Impl* newp = (Impl*)prog.p;
4094 const String& ProgramSource::source() const
4097 CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4098 CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4102 ProgramSource::hash_t ProgramSource::hash() const
4104 CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4107 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4108 const unsigned char* binary, const size_t size,
4109 const cv::String& buildOptions)
4112 CV_Assert(size > 0);
4113 return Impl::fromBinary(module, name, binary, size, buildOptions);
4116 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4117 const unsigned char* binary, const size_t size,
4118 const cv::String& buildOptions)
4121 CV_Assert(size > 0);
4122 return Impl::fromBinary(module, name, binary, size, buildOptions);
4126 internal::ProgramEntry::operator ProgramSource&() const
4128 if (this->pProgramSource == NULL)
4130 cv::AutoLock lock(cv::getInitializationMutex());
4131 if (this->pProgramSource == NULL)
4133 ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4134 ProgramSource* ptr = new ProgramSource(ps);
4135 const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4138 return *this->pProgramSource;
4143 /////////////////////////////////////////// Program /////////////////////////////////////////////
4146 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4154 return a + (cv::String(" ") + b);
4157 struct Program::Impl
4159 IMPLEMENT_REFCOUNTABLE();
4161 Impl(const ProgramSource& src,
4162 const String& _buildflags, String& errmsg) :
4165 buildflags(_buildflags)
4167 const ProgramSource::Impl* src_ = src.getImpl();
4169 sourceModule_ = src_->module_;
4170 sourceName_ = src_->name_;
4171 const Context ctx = Context::getDefault();
4172 Device device = ctx.device(0);
4173 if (ctx.ptr() == NULL || device.ptr() == NULL)
4175 buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4176 if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4179 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4180 else if (device.isIntel())
4181 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4182 const String param_buildExtraOptions = getBuildExtraOptions();
4183 if (!param_buildExtraOptions.empty())
4184 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4186 compile(ctx, src_, errmsg);
4189 bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4191 CV_Assert(ctx.getImpl());
4194 // We don't cache OpenCL binaries
4195 if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4197 CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4198 bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4201 return compileWithCache(ctx, src_, errmsg);
4204 bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4206 CV_Assert(ctx.getImpl());
4208 CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4210 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4211 OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4212 const std::string base_dir = config.prepareCacheDirectoryForContext(
4213 ctx.getImpl()->getPrefixString(),
4214 ctx.getImpl()->getPrefixBase()
4216 const String& hash_str = src_->sourceHash_;
4218 if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4220 CV_Assert(!hash_str.empty());
4221 fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4222 fname = utils::fs::join(base_dir, fname);
4224 const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4225 if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4229 std::vector<char> binaryBuf;
4232 cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4233 BinaryProgramFile file(fname, hash_str.c_str());
4234 res = file.read(buildflags, binaryBuf);
4238 CV_Assert(!binaryBuf.empty());
4239 CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4240 bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4245 catch (const cv::Exception& e)
4248 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4252 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4255 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4256 CV_Assert(handle == NULL);
4257 if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4259 if (!buildFromSources(ctx, src_, errmsg))
4264 else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4266 buildflags = joinBuildOptions(buildflags, " -x spir");
4267 if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4269 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4271 CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4272 bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4276 else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4278 CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4282 CV_Error(Error::StsInternal, "Internal error");
4284 CV_Assert(handle != NULL);
4285 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4286 if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4290 std::vector<char> binaryBuf;
4291 getProgramBinary(binaryBuf);
4293 cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4294 BinaryProgramFile file(fname, hash_str.c_str());
4295 file.write(buildflags, binaryBuf);
4298 catch (const cv::Exception& e)
4300 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4304 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4307 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4308 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4309 if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4311 std::vector<char> binaryBuf;
4312 getProgramBinary(binaryBuf);
4313 if (!binaryBuf.empty())
4315 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4317 createFromBinary(ctx, binaryBuf, errmsg);
4321 return handle != NULL;
4324 void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4326 AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4329 cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4330 CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4331 if (log_retval == CL_SUCCESS && retsz > 1)
4333 buffer.resize(retsz + 16);
4334 log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4335 CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4336 if (log_retval == CL_SUCCESS)
4338 if (retsz < buffer.size())
4341 buffer[buffer.size() - 1] = 0;
4349 errmsg = String(buffer.data());
4350 printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4351 sourceModule_.c_str(), sourceName_.c_str(),
4352 result, getOpenCLErrorString(result),
4353 buildflags.c_str(), errmsg.c_str());
4357 bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4360 CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4361 CV_Assert(handle == NULL);
4362 CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4363 sourceModule_.c_str(), sourceName_.c_str(),
4364 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4366 CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4368 const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4369 size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4370 CV_Assert(srcptr != NULL);
4371 CV_Assert(srclen > 0);
4375 handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4376 CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4377 CV_Assert(handle || retval != CL_SUCCESS);
4378 if (handle && retval == CL_SUCCESS)
4380 size_t n = ctx.ndevices();
4381 AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4382 cl_device_id* deviceList = deviceListBuf.data();
4383 for (size_t i = 0; i < n; i++)
4385 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4388 retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4389 CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4390 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4391 if (retval != CL_SUCCESS)
4394 dumpBuildLog_(retval, deviceList, errmsg);
4396 // don't remove "retval != CL_SUCCESS" condition here:
4397 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4398 if (retval != CL_SUCCESS && handle)
4400 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4404 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4405 if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4407 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4409 char kernels_buffer[4096] = {0};
4410 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4411 if (retsz < sizeof(kernels_buffer))
4412 kernels_buffer[retsz] = 0;
4414 kernels_buffer[0] = 0;
4415 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4420 return handle != NULL;
4423 void getProgramBinary(std::vector<char>& buf)
4427 CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4429 uchar* ptr = (uchar*)&buf[0];
4430 CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4433 bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4435 return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4438 bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4440 CV_Assert(handle == NULL);
4441 CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4442 CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4444 CV_Assert(binarySize > 0);
4446 size_t ndevices = (int)ctx.ndevices();
4447 AutoBuffer<cl_device_id> devices_(ndevices);
4448 AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4449 AutoBuffer<size_t> binarySizes_(ndevices);
4451 cl_device_id* devices = devices_.data();
4452 const uchar** binaryPtrs = binaryPtrs_.data();
4453 size_t* binarySizes = binarySizes_.data();
4454 for (size_t i = 0; i < ndevices; i++)
4456 devices[i] = (cl_device_id)ctx.device(i).ptr();
4457 binaryPtrs[i] = binaryAddr;
4458 binarySizes[i] = binarySize;
4462 handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4463 binarySizes, binaryPtrs, NULL, &result);
4464 if (result != CL_SUCCESS)
4466 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4469 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4477 // call clBuildProgram()
4479 result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4480 CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4481 if (result != CL_SUCCESS)
4483 dumpBuildLog_(result, devices, errmsg);
4486 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4492 // check build status
4494 cl_build_status build_status = CL_BUILD_NONE;
4496 CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4497 sizeof(build_status), &build_status, &retsz));
4498 if (result == CL_SUCCESS)
4500 if (build_status == CL_BUILD_SUCCESS)
4506 CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4512 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4515 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4520 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4521 if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4523 CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4525 char kernels_buffer[4096] = {0};
4526 result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4527 if (retsz < sizeof(kernels_buffer))
4528 kernels_buffer[retsz] = 0;
4530 kernels_buffer[0] = 0;
4531 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4534 return handle != NULL;
4542 if (!cv::__termination)
4545 clReleaseProgram(handle);
4554 String sourceModule_;
4559 Program::Program() { p = 0; }
4561 Program::Program(const ProgramSource& src,
4562 const String& buildflags, String& errmsg)
4565 create(src, buildflags, errmsg);
4568 Program::Program(const Program& prog)
4575 Program& Program::operator = (const Program& prog)
4577 Impl* newp = (Impl*)prog.p;
4592 bool Program::create(const ProgramSource& src,
4593 const String& buildflags, String& errmsg)
4600 p = new Impl(src, buildflags, errmsg);
4609 void* Program::ptr() const
4611 return p ? p->handle : 0;
4614 #ifndef OPENCV_REMOVE_DEPRECATED_API
4615 const ProgramSource& Program::source() const
4617 CV_Error(Error::StsNotImplemented, "Removed API");
4620 bool Program::read(const String& bin, const String& buildflags)
4622 CV_UNUSED(bin); CV_UNUSED(buildflags);
4623 CV_Error(Error::StsNotImplemented, "Removed API");
4626 bool Program::write(String& bin) const
4629 CV_Error(Error::StsNotImplemented, "Removed API");
4632 String Program::getPrefix() const
4636 Context::Impl* ctx_ = Context::getDefault().getImpl();
4638 return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4641 String Program::getPrefix(const String& buildflags)
4643 Context::Impl* ctx_ = Context::getDefault().getImpl();
4645 return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4647 #endif // OPENCV_REMOVE_DEPRECATED_API
4649 void Program::getBinary(std::vector<char>& binary) const
4651 CV_Assert(p && "Empty program");
4652 p->getProgramBinary(binary);
4655 Program Context::Impl::getProg(const ProgramSource& src,
4656 const String& buildflags, String& errmsg)
4658 size_t limit = getProgramCountLimit();
4659 const ProgramSource::Impl* src_ = src.getImpl();
4661 String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4662 src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4663 getPrefixString().c_str(),
4664 buildflags.c_str());
4666 cv::AutoLock lock(program_cache_mutex);
4667 phash_t::iterator it = phash.find(key);
4668 if (it != phash.end())
4671 CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4672 if (i != cacheList.end() && i != cacheList.begin())
4675 cacheList.push_front(key);
4679 { // cleanup program cache
4680 size_t sz = phash.size();
4681 if (limit > 0 && sz >= limit)
4683 static bool warningFlag = false;
4686 printf("\nWARNING: OpenCV-OpenCL:\n"
4687 " In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4688 " You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4691 while (!cacheList.empty())
4693 size_t c = phash.erase(cacheList.back());
4694 cacheList.pop_back();
4701 Program prog(src, buildflags, errmsg);
4702 // Cache result of build failures too (to prevent unnecessary compiler invocations)
4704 cv::AutoLock lock(program_cache_mutex);
4705 phash.insert(std::pair<std::string, Program>(key, prog));
4706 cacheList.push_front(key);
4712 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4714 template<typename T>
4715 class OpenCLBufferPool
4718 ~OpenCLBufferPool() { }
4720 virtual T allocate(size_t size) = 0;
4721 virtual void release(T buffer) = 0;
4724 template <typename Derived, typename BufferEntry, typename T>
4725 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4728 inline Derived& derived() { return *static_cast<Derived*>(this); }
4732 size_t currentReservedSize;
4733 size_t maxReservedSize;
4735 std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4736 std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4739 bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4741 typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4742 for (; i != allocatedEntries_.end(); ++i)
4744 BufferEntry& e = *i;
4745 if (e.clBuffer_ == buffer)
4748 allocatedEntries_.erase(i);
4756 bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4758 if (reservedEntries_.empty())
4760 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4761 typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4763 size_t minDiff = (size_t)(-1);
4764 for (; i != reservedEntries_.end(); ++i)
4766 BufferEntry& e = *i;
4767 if (e.capacity_ >= size)
4769 size_t diff = e.capacity_ - size;
4770 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4780 if (result_pos != reservedEntries_.end())
4782 //CV_DbgAssert(result == *result_pos);
4783 reservedEntries_.erase(result_pos);
4785 currentReservedSize -= entry.capacity_;
4786 allocatedEntries_.push_back(entry);
4793 void _checkSizeOfReservedEntries()
4795 while (currentReservedSize > maxReservedSize)
4797 CV_DbgAssert(!reservedEntries_.empty());
4798 const BufferEntry& entry = reservedEntries_.back();
4799 CV_DbgAssert(currentReservedSize >= entry.capacity_);
4800 currentReservedSize -= entry.capacity_;
4801 derived()._releaseBufferEntry(entry);
4802 reservedEntries_.pop_back();
4806 inline size_t _allocationGranularity(size_t size)
4809 if (size < 1024*1024)
4810 return 4096; // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4811 else if (size < 16*1024*1024)
4818 OpenCLBufferPoolBaseImpl()
4819 : currentReservedSize(0),
4824 virtual ~OpenCLBufferPoolBaseImpl()
4826 freeAllReservedBuffers();
4827 CV_Assert(reservedEntries_.empty());
4830 virtual T allocate(size_t size) CV_OVERRIDE
4832 AutoLock locker(mutex_);
4834 if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4836 CV_DbgAssert(size <= entry.capacity_);
4837 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4841 derived()._allocateBufferEntry(entry, size);
4843 return entry.clBuffer_;
4845 virtual void release(T buffer) CV_OVERRIDE
4847 AutoLock locker(mutex_);
4849 CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4850 if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4852 derived()._releaseBufferEntry(entry);
4856 reservedEntries_.push_front(entry);
4857 currentReservedSize += entry.capacity_;
4858 _checkSizeOfReservedEntries();
4862 virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4863 virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4864 virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4866 AutoLock locker(mutex_);
4867 size_t oldMaxReservedSize = maxReservedSize;
4868 maxReservedSize = size;
4869 if (maxReservedSize < oldMaxReservedSize)
4871 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4872 for (; i != reservedEntries_.end();)
4874 const BufferEntry& entry = *i;
4875 if (entry.capacity_ > maxReservedSize / 8)
4877 CV_DbgAssert(currentReservedSize >= entry.capacity_);
4878 currentReservedSize -= entry.capacity_;
4879 derived()._releaseBufferEntry(entry);
4880 i = reservedEntries_.erase(i);
4885 _checkSizeOfReservedEntries();
4888 virtual void freeAllReservedBuffers() CV_OVERRIDE
4890 AutoLock locker(mutex_);
4891 typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4892 for (; i != reservedEntries_.end(); ++i)
4894 const BufferEntry& entry = *i;
4895 derived()._releaseBufferEntry(entry);
4897 reservedEntries_.clear();
4898 currentReservedSize = 0;
4902 struct CLBufferEntry
4906 CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4909 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4912 typedef struct CLBufferEntry BufferEntry;
4916 OpenCLBufferPoolImpl(int createFlags = 0)
4917 : createFlags_(createFlags)
4921 void _allocateBufferEntry(BufferEntry& entry, size_t size)
4923 CV_DbgAssert(entry.clBuffer_ == NULL);
4924 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4925 Context& ctx = Context::getDefault();
4926 cl_int retval = CL_SUCCESS;
4927 entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4928 CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4929 CV_Assert(entry.clBuffer_ != NULL);
4930 if(retval == CL_SUCCESS)
4932 CV_IMPL_ADD(CV_IMPL_OCL);
4934 LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4935 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4936 allocatedEntries_.push_back(entry);
4939 void _releaseBufferEntry(const BufferEntry& entry)
4941 CV_Assert(entry.capacity_ != 0);
4942 CV_Assert(entry.clBuffer_ != NULL);
4943 LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4944 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4945 CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4949 #ifdef HAVE_OPENCL_SVM
4950 struct CLSVMBufferEntry
4954 CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4956 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4959 typedef struct CLSVMBufferEntry BufferEntry;
4961 OpenCLSVMBufferPoolImpl()
4965 void _allocateBufferEntry(BufferEntry& entry, size_t size)
4967 CV_DbgAssert(entry.clBuffer_ == NULL);
4968 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4970 Context& ctx = Context::getDefault();
4971 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4972 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4973 cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4974 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4976 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4977 CV_DbgAssert(svmFns->isValid());
4979 CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4980 void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4983 entry.clBuffer_ = buf;
4985 CV_IMPL_ADD(CV_IMPL_OCL);
4987 LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4988 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4989 allocatedEntries_.push_back(entry);
4992 void _releaseBufferEntry(const BufferEntry& entry)
4994 CV_Assert(entry.capacity_ != 0);
4995 CV_Assert(entry.clBuffer_ != NULL);
4996 LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4997 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4998 Context& ctx = Context::getDefault();
4999 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5000 CV_DbgAssert(svmFns->isValid());
5001 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_);
5002 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
5009 template <bool readAccess, bool writeAccess>
5010 class AlignedDataPtr
5014 uchar* const originPtr_;
5015 const size_t alignment_;
5017 uchar* allocatedPtr_;
5020 AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
5021 : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5023 CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5024 CV_DbgAssert(!readAccess || ptr);
5025 if (((size_t)ptr_ & (alignment - 1)) != 0)
5027 allocatedPtr_ = new uchar[size_ + alignment - 1];
5028 ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5031 memcpy(ptr_, originPtr_, size_);
5036 uchar* getAlignedPtr() const
5038 CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5048 memcpy(originPtr_, ptr_, size_);
5050 delete[] allocatedPtr_;
5051 allocatedPtr_ = NULL;
5056 AlignedDataPtr(const AlignedDataPtr&); // disabled
5057 AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5060 template <bool readAccess, bool writeAccess>
5061 class AlignedDataPtr2D
5065 uchar* const originPtr_;
5066 const size_t alignment_;
5068 uchar* allocatedPtr_;
5074 AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5075 : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5077 CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5078 CV_DbgAssert(!readAccess || ptr != NULL);
5079 if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5081 allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5082 ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5085 for (size_t i = 0; i < rows_; i++)
5086 memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5091 uchar* getAlignedPtr() const
5093 CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5103 for (size_t i = 0; i < rows_; i++)
5104 memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5106 delete[] allocatedPtr_;
5107 allocatedPtr_ = NULL;
5112 AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5113 AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5116 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5117 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5121 void Context::Impl::__init_buffer_pools()
5123 bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5124 OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5125 bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5126 OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5128 size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5129 size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5130 bufferPool.setMaxReservedSize(poolSize);
5131 size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5132 bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5134 #ifdef HAVE_OPENCL_SVM
5135 bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5136 OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5137 size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5138 bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5141 CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5144 class OpenCLAllocator CV_FINAL : public MatAllocator
5149 ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5150 ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5151 #ifdef HAVE_OPENCL_SVM
5152 ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5154 ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3 // convertFromBuffer()
5159 matStdAllocator = Mat::getDefaultAllocator();
5163 flushCleanupQueue();
5166 UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5167 AccessFlag flags, UMatUsageFlags usageFlags) const
5169 UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5173 static bool isOpenCLMapForced() // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5175 static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5178 static bool isOpenCLCopyingForced() // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5180 static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5184 void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5186 const Device& dev = ctx.device(0);
5188 if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5189 createFlags |= CL_MEM_ALLOC_HOST_PTR;
5191 if (!isOpenCLCopyingForced() &&
5192 (isOpenCLMapForced() ||
5193 (dev.hostUnifiedMemory()
5200 flags0 = static_cast<UMatData::MemoryFlag>(0);
5202 flags0 = UMatData::COPY_ON_MAP;
5205 UMatData* allocate(int dims, const int* sizes, int type,
5206 void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5209 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5211 flushCleanupQueue();
5213 CV_Assert(data == 0);
5214 size_t total = CV_ELEM_SIZE(type);
5215 for( int i = dims-1; i >= 0; i-- )
5222 Context& ctx = Context::getDefault();
5224 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5225 Context::Impl& ctxImpl = *ctx.getImpl();
5227 int createFlags = 0;
5228 UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5229 getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5231 void* handle = NULL;
5232 int allocatorFlags = 0;
5234 #ifdef HAVE_OPENCL_SVM
5235 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5236 if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5238 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5239 handle = ctxImpl.getBufferPoolSVM().allocate(total);
5241 // this property is constant, so single buffer pool can be used here
5242 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5243 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5247 if (createFlags == 0)
5249 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5250 handle = ctxImpl.getBufferPool().allocate(total);
5252 else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5254 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5255 handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5259 CV_Assert(handle != NULL); // Unsupported, throw
5263 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5265 UMatData* u = new UMatData(this);
5270 u->allocatorFlags_ = allocatorFlags;
5271 u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5272 CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5273 u->markHostCopyObsolete(true);
5274 opencl_allocator_stats.onAllocate(u->size);
5278 bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5283 flushCleanupQueue();
5285 UMatDataAutoLock lock(u);
5289 CV_Assert(u->origdata != 0);
5290 Context& ctx = Context::getDefault();
5291 int createFlags = 0;
5292 UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5293 getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5295 bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5297 cl_context ctx_handle = (cl_context)ctx.ptr();
5298 int allocatorFlags = 0;
5299 UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5300 void* handle = NULL;
5301 cl_int retval = CL_SUCCESS;
5303 #ifdef HAVE_OPENCL_SVM
5304 svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5305 bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5306 if (useSVM && svmCaps.isSupportFineGrainSystem())
5308 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5309 tempUMatFlags = UMatData::TEMP_UMAT;
5310 handle = u->origdata;
5311 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5313 else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5315 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5317 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5319 cl_svm_mem_flags memFlags = createFlags |
5320 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5322 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5323 CV_DbgAssert(svmFns->isValid());
5325 CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5326 handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5329 cl_command_queue q = NULL;
5330 if (!isFineGrainBuffer)
5332 q = (cl_command_queue)Queue::getDefault().ptr();
5333 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5334 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5337 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5340 memcpy(handle, u->origdata, u->size);
5341 if (!isFineGrainBuffer)
5343 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5344 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5345 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5348 tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5349 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5350 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5357 accessFlags &= ~ACCESS_FAST;
5359 tempUMatFlags = UMatData::TEMP_UMAT;
5364 CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5365 // There are OpenCL runtime issues for less aligned data
5366 && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5367 && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5368 // Avoid sharing of host memory between OpenCL buffers
5369 && !(u->originalUMatData && u->originalUMatData->handle)
5372 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5373 u->size, u->origdata, &retval);
5374 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5375 (long long int)u->size, u->origdata, (void*)handle).c_str());
5377 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5379 handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5380 u->size, u->origdata, &retval);
5381 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5382 (long long int)u->size, u->origdata, (void*)handle).c_str());
5383 tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5386 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5387 if(!handle || retval != CL_SUCCESS)
5390 u->prevAllocator = u->currAllocator;
5391 u->currAllocator = this;
5392 u->flags |= tempUMatFlags | flags0;
5393 u->allocatorFlags_ = allocatorFlags;
5395 if (!!(accessFlags & ACCESS_WRITE))
5396 u->markHostCopyObsolete(true);
5397 opencl_allocator_stats.onAllocate(u->size);
5401 /*void sync(UMatData* u) const
5403 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5404 UMatDataAutoLock lock(u);
5406 if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5408 if( u->tempCopiedUMat() )
5410 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5411 u->size, u->origdata, 0, 0, 0);
5416 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5417 (CL_MAP_READ | CL_MAP_WRITE),
5418 0, u->size, 0, 0, 0, &retval);
5419 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5422 u->markHostCopyObsolete(false);
5424 else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5426 clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5427 u->size, u->data, 0, 0, 0);
5431 void deallocate(UMatData* u) const CV_OVERRIDE
5436 CV_Assert(u->urefcount == 0);
5437 CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5439 CV_Assert(u->handle != 0);
5440 CV_Assert(u->mapcount == 0);
5442 if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5443 addToCleanupQueue(u);
5448 void deallocate_(UMatData* u) const
5451 CV_Assert(u->handle);
5452 if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5454 opencl_allocator_stats.onFree(u->size);
5458 if (cv::__termination) // process is not in consistent state (after ExitProcess call) and terminating
5459 return; // avoid any OpenCL calls
5463 CV_Assert(u->origdata);
5464 // UMatDataAutoLock lock(u);
5466 if (u->hostCopyObsolete())
5468 #ifdef HAVE_OPENCL_SVM
5469 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5471 Context& ctx = Context::getDefault();
5472 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5473 CV_DbgAssert(svmFns->isValid());
5475 if( u->tempCopiedUMat() )
5477 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5478 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5479 bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5480 cl_command_queue q = NULL;
5481 if (!isFineGrainBuffer)
5483 CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5484 q = (cl_command_queue)Queue::getDefault().ptr();
5485 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5486 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5489 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5492 memcpy(u->origdata, u->handle, u->size);
5493 if (!isFineGrainBuffer)
5495 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5496 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5497 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5502 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5509 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5510 if( u->tempCopiedUMat() )
5512 AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5513 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5514 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5521 CV_Assert(u->mapcount == 0);
5522 flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5523 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5524 (CL_MAP_READ | CL_MAP_WRITE),
5525 0, u->size, 0, 0, 0, &retval);
5526 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5527 CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5528 if (u->originalUMatData)
5530 CV_Assert(u->originalUMatData->data == data);
5532 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5533 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5534 CV_OCL_DBG_CHECK(clFinish(q));
5538 u->markHostCopyObsolete(false);
5544 #ifdef HAVE_OPENCL_SVM
5545 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5547 if( u->tempCopiedUMat() )
5549 Context& ctx = Context::getDefault();
5550 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5551 CV_DbgAssert(svmFns->isValid());
5553 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5554 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5560 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5561 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5564 u->markDeviceCopyObsolete(true);
5565 u->currAllocator = u->prevAllocator;
5566 u->prevAllocator = NULL;
5567 if(u->data && u->copyOnMap() && u->data != u->origdata)
5569 u->data = u->origdata;
5570 u->currAllocator->deallocate(u);
5575 CV_Assert(u->origdata == NULL);
5576 if(u->data && u->copyOnMap() && u->data != u->origdata)
5580 u->markHostCopyObsolete(true);
5582 if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5584 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5586 ocl::Context& ctx = *pCtx.get();
5587 CV_Assert(ctx.getImpl());
5588 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5590 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5592 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5594 ocl::Context& ctx = *pCtx.get();
5595 CV_Assert(ctx.getImpl());
5596 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5598 #ifdef HAVE_OPENCL_SVM
5599 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5601 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5603 ocl::Context& ctx = *pCtx.get();
5604 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5608 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5609 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5611 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5612 CV_DbgAssert(svmFns->isValid());
5613 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5615 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5617 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5618 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5619 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5622 CV_Assert(ctx.getImpl());
5623 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5628 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5631 u->markDeviceCopyObsolete(true);
5635 CV_Assert(u == NULL);
5638 // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5639 void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5641 CV_Assert(u && u->handle);
5643 if (!!(accessFlags & ACCESS_WRITE))
5644 u->markDeviceCopyObsolete(true);
5646 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5649 if( !u->copyOnMap() )
5652 // because there can be other map requests for the same UMat with different access flags,
5653 // we use the universal (read-write) access mode.
5654 #ifdef HAVE_OPENCL_SVM
5655 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5657 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5659 Context& ctx = Context::getDefault();
5660 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5661 CV_DbgAssert(svmFns->isValid());
5663 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5665 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5666 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5669 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5670 u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5674 u->data = (uchar*)u->handle;
5675 u->markHostCopyObsolete(false);
5676 u->markDeviceMemMapped(true);
5681 cl_int retval = CL_SUCCESS;
5682 if (!u->deviceMemMapped())
5684 CV_Assert(u->refcount == 1);
5685 CV_Assert(u->mapcount++ == 0);
5686 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5687 (CL_MAP_READ | CL_MAP_WRITE),
5688 0, u->size, 0, 0, 0, &retval);
5689 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5691 if (u->data && retval == CL_SUCCESS)
5693 u->markHostCopyObsolete(false);
5694 u->markDeviceMemMapped(true);
5698 // TODO Is it really a good idea and was it tested well?
5699 // if map failed, switch to copy-on-map mode for the particular buffer
5700 u->flags |= UMatData::COPY_ON_MAP;
5705 u->data = (uchar*)fastMalloc(u->size);
5706 u->markHostCopyObsolete(true);
5710 if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5712 AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5713 #ifdef HAVE_OPENCL_SVM
5714 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5716 cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5717 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5718 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5719 (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5720 u->markHostCopyObsolete(false);
5724 void unmap(UMatData* u) const CV_OVERRIDE
5730 CV_Assert(u->handle != 0);
5732 UMatDataAutoLock autolock(u);
5734 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5736 if( !u->copyOnMap() && u->deviceMemMapped() )
5738 CV_Assert(u->data != NULL);
5739 #ifdef HAVE_OPENCL_SVM
5740 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5742 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5744 Context& ctx = Context::getDefault();
5745 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5746 CV_DbgAssert(svmFns->isValid());
5748 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5750 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5751 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5753 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5755 u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5758 if (u->refcount == 0)
5760 u->markDeviceCopyObsolete(false);
5761 u->markHostCopyObsolete(true);
5765 if (u->refcount == 0)
5767 CV_Assert(u->mapcount-- == 1);
5768 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5769 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5770 if (Device::getDefault().isAMD())
5772 // required for multithreaded applications (see stitching test)
5773 CV_OCL_DBG_CHECK(clFinish(q));
5775 u->markDeviceMemMapped(false);
5777 u->markDeviceCopyObsolete(false);
5778 u->markHostCopyObsolete(true);
5781 else if( u->copyOnMap() && u->deviceCopyObsolete() )
5783 AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5784 #ifdef HAVE_OPENCL_SVM
5785 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5787 retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5788 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5789 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5790 (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5791 u->markDeviceCopyObsolete(false);
5792 u->markHostCopyObsolete(true);
5796 bool checkContinuous(int dims, const size_t sz[],
5797 const size_t srcofs[], const size_t srcstep[],
5798 const size_t dstofs[], const size_t dststep[],
5799 size_t& total, size_t new_sz[],
5800 size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5801 size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5803 bool iscontinuous = true;
5804 srcrawofs = srcofs ? srcofs[dims-1] : 0;
5805 dstrawofs = dstofs ? dstofs[dims-1] : 0;
5807 for( int i = dims-2; i >= 0; i-- )
5809 if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5810 iscontinuous = false;
5813 srcrawofs += srcofs[i]*srcstep[i];
5815 dstrawofs += dstofs[i]*dststep[i];
5820 // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5823 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5824 // we assume that new_... arrays are initialized by caller
5825 // with 0's, so there is no else branch
5828 new_srcofs[0] = srcofs[1];
5829 new_srcofs[1] = srcofs[0];
5835 new_dstofs[0] = dstofs[1];
5836 new_dstofs[1] = dstofs[0];
5840 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5841 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5845 // we could check for dims == 3 here,
5846 // but from user perspective this one is more informative
5847 CV_Assert(dims <= 3);
5848 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5851 new_srcofs[0] = srcofs[2];
5852 new_srcofs[1] = srcofs[1];
5853 new_srcofs[2] = srcofs[0];
5858 new_dstofs[0] = dstofs[2];
5859 new_dstofs[1] = dstofs[1];
5860 new_dstofs[2] = dstofs[0];
5863 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5864 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5867 return iscontinuous;
5870 void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5871 const size_t srcofs[], const size_t srcstep[],
5872 const size_t dststep[]) const CV_OVERRIDE
5876 UMatDataAutoLock autolock(u);
5878 if( u->data && !u->hostCopyObsolete() )
5880 Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5883 CV_Assert( u->handle != 0 );
5885 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5887 size_t total = 0, new_sz[] = {0, 0, 0};
5888 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5889 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5891 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5893 srcrawofs, new_srcofs, new_srcstep,
5894 dstrawofs, new_dstofs, new_dststep);
5896 #ifdef HAVE_OPENCL_SVM
5897 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5899 CV_DbgAssert(u->data == NULL || u->data == u->handle);
5900 Context& ctx = Context::getDefault();
5901 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5902 CV_DbgAssert(svmFns->isValid());
5904 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5905 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5907 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5908 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5911 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5916 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5920 // This code is from MatAllocator::download()
5921 int isz[CV_MAX_DIM];
5922 uchar* srcptr = (uchar*)u->handle;
5923 for( int i = 0; i < dims; i++ )
5925 CV_Assert( sz[i] <= (size_t)INT_MAX );
5929 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5930 isz[i] = (int)sz[i];
5933 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5934 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5936 const Mat* arrays[] = { &src, &dst };
5938 NAryMatIterator it(arrays, ptrs, 2);
5939 size_t j, planesz = it.size;
5941 for( j = 0; j < it.nplanes; j++, ++it )
5942 memcpy(ptrs[1], ptrs[0], planesz);
5944 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5946 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5947 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5949 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5958 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5959 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5960 srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5962 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5964 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5965 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5966 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5967 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5968 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5969 uchar* ptr = alignedPtr.getAlignedPtr();
5971 CV_Assert(new_srcstep[0] >= new_sz[0]);
5972 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5973 total = std::min(total, u->size - new_srcrawofs);
5974 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5975 new_srcrawofs, total, ptr, 0, 0, 0));
5976 for( size_t i = 0; i < new_sz[1]; i++ )
5977 memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5981 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5982 uchar* ptr = alignedPtr.getAlignedPtr();
5984 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5985 new_srcofs, new_dstofs, new_sz,
5993 void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5994 const size_t dstofs[], const size_t dststep[],
5995 const size_t srcstep[]) const CV_OVERRIDE
6000 // there should be no user-visible CPU copies of the UMat which we are going to copy to
6001 CV_Assert(u->refcount == 0 || u->tempUMat());
6003 size_t total = 0, new_sz[] = {0, 0, 0};
6004 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6005 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6007 bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
6009 srcrawofs, new_srcofs, new_srcstep,
6010 dstrawofs, new_dstofs, new_dststep);
6012 UMatDataAutoLock autolock(u);
6014 // if there is cached CPU copy of the GPU matrix,
6015 // we could use it as a destination.
6016 // we can do it in 2 cases:
6017 // 1. we overwrite the whole content
6018 // 2. we overwrite part of the matrix, but the GPU copy is out-of-date
6019 if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
6021 Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6022 u->markHostCopyObsolete(false);
6023 u->markDeviceCopyObsolete(true);
6027 CV_Assert( u->handle != 0 );
6028 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6030 #ifdef HAVE_OPENCL_SVM
6031 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6033 CV_DbgAssert(u->data == NULL || u->data == u->handle);
6034 Context& ctx = Context::getDefault();
6035 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6036 CV_DbgAssert(svmFns->isValid());
6038 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6039 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6041 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6042 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6045 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6050 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6054 // This code is from MatAllocator::upload()
6055 int isz[CV_MAX_DIM];
6056 uchar* dstptr = (uchar*)u->handle;
6057 for( int i = 0; i < dims; i++ )
6059 CV_Assert( sz[i] <= (size_t)INT_MAX );
6063 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6064 isz[i] = (int)sz[i];
6067 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6068 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6070 const Mat* arrays[] = { &src, &dst };
6072 NAryMatIterator it(arrays, ptrs, 2);
6073 size_t j, planesz = it.size;
6075 for( j = 0; j < it.nplanes; j++, ++it )
6076 memcpy(ptrs[1], ptrs[0], planesz);
6078 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6080 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6081 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6083 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6092 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6093 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6094 dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6095 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6096 (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6098 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6100 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6101 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6102 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6103 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6104 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6105 uchar* ptr = alignedPtr.getAlignedPtr();
6107 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6108 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6109 total = std::min(total, u->size - new_dstrawofs);
6110 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6111 (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6112 (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6113 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6114 new_dstrawofs, total, ptr, 0, 0, 0));
6115 for( size_t i = 0; i < new_sz[1]; i++ )
6116 memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6117 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6118 new_dstrawofs, total, ptr, 0, 0, 0));
6122 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6123 uchar* ptr = alignedPtr.getAlignedPtr();
6125 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6126 new_dstofs, new_srcofs, new_sz,
6132 u->markHostCopyObsolete(true);
6133 #ifdef HAVE_OPENCL_SVM
6134 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6135 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6142 u->markHostCopyObsolete(true);
6144 u->markDeviceCopyObsolete(false);
6147 void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6148 const size_t srcofs[], const size_t srcstep[],
6149 const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6154 size_t total = 0, new_sz[] = {0, 0, 0};
6155 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6156 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6158 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6160 srcrawofs, new_srcofs, new_srcstep,
6161 dstrawofs, new_dstofs, new_dststep);
6163 UMatDataAutoLock src_autolock(src, dst);
6165 if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6167 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6170 if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6172 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6173 dst->markHostCopyObsolete(false);
6174 #ifdef HAVE_OPENCL_SVM
6175 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6176 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6183 dst->markDeviceCopyObsolete(true);
6188 // there should be no user-visible CPU copies of the UMat which we are going to copy to
6189 CV_Assert(dst->refcount == 0);
6190 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6192 cl_int retval = CL_SUCCESS;
6193 #ifdef HAVE_OPENCL_SVM
6194 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6195 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6197 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6198 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6200 Context& ctx = Context::getDefault();
6201 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6202 CV_DbgAssert(svmFns->isValid());
6206 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6207 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6208 cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6209 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6210 total, 0, NULL, NULL);
6211 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6216 // This code is from MatAllocator::download()/upload()
6217 int isz[CV_MAX_DIM];
6218 uchar* srcptr = (uchar*)src->handle;
6219 for( int i = 0; i < dims; i++ )
6221 CV_Assert( sz[i] <= (size_t)INT_MAX );
6225 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6226 isz[i] = (int)sz[i];
6228 Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6230 uchar* dstptr = (uchar*)dst->handle;
6231 for( int i = 0; i < dims; i++ )
6234 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6236 Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6238 const Mat* arrays[] = { &m_src, &m_dst };
6240 NAryMatIterator it(arrays, ptrs, 2);
6241 size_t j, planesz = it.size;
6243 for( j = 0; j < it.nplanes; j++, ++it )
6244 memcpy(ptrs[1], ptrs[0], planesz);
6249 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6251 map(src, ACCESS_READ);
6252 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6257 map(dst, ACCESS_WRITE);
6258 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6268 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6269 srcrawofs, dstrawofs, total, 0, 0, 0);
6270 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6271 (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6273 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6275 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6276 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6277 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6278 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6279 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6281 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6282 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6283 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6284 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6285 uchar* srcptr = srcBuf.getAlignedPtr();
6286 uchar* dstptr = dstBuf.getAlignedPtr();
6288 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6290 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6291 src_total = std::min(src_total, src->size - new_srcrawofs);
6292 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6293 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6295 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6296 new_srcrawofs, src_total, srcptr, 0, 0, 0));
6297 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6298 new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6300 for( size_t i = 0; i < new_sz[1]; i++ )
6301 memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6302 srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6303 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6304 new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6308 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6309 new_srcofs, new_dstofs, new_sz,
6315 if (retval == CL_SUCCESS)
6317 CV_IMPL_ADD(CV_IMPL_OCL)
6320 #ifdef HAVE_OPENCL_SVM
6321 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6322 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6329 dst->markHostCopyObsolete(true);
6331 dst->markDeviceCopyObsolete(false);
6335 CV_OCL_DBG_CHECK(clFinish(q));
6339 BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6341 ocl::Context ctx = Context::getDefault();
6344 #ifdef HAVE_OPENCL_SVM
6345 if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6347 return &ctx.getImpl()->getBufferPoolSVM();
6350 if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6352 return &ctx.getImpl()->getBufferPoolHostPtr();
6354 if (id != NULL && strcmp(id, "OCL") != 0)
6356 CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6358 return &ctx.getImpl()->getBufferPool();
6361 MatAllocator* matStdAllocator;
6363 mutable cv::Mutex cleanupQueueMutex;
6364 mutable std::deque<UMatData*> cleanupQueue;
6366 void flushCleanupQueue() const
6368 if (!cleanupQueue.empty())
6370 std::deque<UMatData*> q;
6372 cv::AutoLock lock(cleanupQueueMutex);
6373 q.swap(cleanupQueue);
6375 for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6381 void addToCleanupQueue(UMatData* u) const
6383 //TODO: Validation check: CV_Assert(!u->tempUMat());
6385 cv::AutoLock lock(cleanupQueueMutex);
6386 cleanupQueue.push_back(u);
6391 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6393 static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6396 MatAllocator* getOpenCLAllocator()
6398 CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6401 }} // namespace cv::ocl
6406 // three funcs below are implemented in umatrix.cpp
6407 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6408 bool autoSteps = false );
6409 void finalizeHdr(UMat& m);
6414 namespace cv { namespace ocl {
6417 // Convert OpenCL buffer memory to UMat
6419 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6422 int sizes[] = { rows, cols };
6424 CV_Assert(0 <= d && d <= CV_MAX_DIM);
6428 dst.flags = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6429 dst.usageFlags = USAGE_DEFAULT;
6431 setSize(dst, d, sizes, 0, true);
6434 cl_mem memobj = (cl_mem)cl_mem_buffer;
6435 cl_mem_object_type mem_type = 0;
6437 CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6439 CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6442 CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6444 CV_OCL_CHECK(clRetainMemObject(memobj));
6446 CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6447 CV_Assert(total >= rows * step);
6449 // attach clBuffer to UMatData
6450 dst.u = new UMatData(getOpenCLAllocator());
6452 dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER; // not allocated from any OpenCV buffer pool
6453 dst.u->flags = static_cast<UMatData::MemoryFlag>(0);
6454 dst.u->handle = cl_mem_buffer;
6455 dst.u->origdata = 0;
6456 dst.u->prevAllocator = 0;
6457 dst.u->size = total;
6463 } // convertFromBuffer()
6467 // Convert OpenCL image2d_t memory to UMat
6469 void convertFromImage(void* cl_mem_image, UMat& dst)
6471 cl_mem clImage = (cl_mem)cl_mem_image;
6472 cl_mem_object_type mem_type = 0;
6474 CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6476 CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6478 cl_image_format fmt = { 0, 0 };
6479 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6482 switch (fmt.image_channel_data_type)
6485 case CL_UNSIGNED_INT8:
6490 case CL_SIGNED_INT8:
6494 case CL_UNORM_INT16:
6495 case CL_UNSIGNED_INT16:
6499 case CL_SNORM_INT16:
6500 case CL_SIGNED_INT16:
6504 case CL_SIGNED_INT32:
6513 CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6517 switch (fmt.image_channel_order)
6520 type = CV_MAKE_TYPE(depth, 1);
6526 type = CV_MAKE_TYPE(depth, 4);
6530 CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6535 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6538 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6541 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6543 dst.create((int)h, (int)w, type);
6545 cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6547 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6550 size_t src_origin[3] = { 0, 0, 0 };
6551 size_t region[3] = { w, h, 1 };
6552 CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6554 CV_OCL_CHECK(clFinish(q));
6557 } // convertFromImage()
6560 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6562 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6564 cl_uint numDevices = 0;
6565 cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6566 if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6568 CV_OCL_DBG_CHECK_RESULT(status,
6569 cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6572 if (numDevices == 0)
6578 devices.resize((size_t)numDevices);
6579 CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6582 struct PlatformInfo::Impl
6587 handle = *(cl_platform_id*)id;
6588 getDevices(devices, handle);
6590 version_ = getStrProp(CL_PLATFORM_VERSION);
6591 parseOpenCLVersion(version_, versionMajor_, versionMinor_);
6594 String getStrProp(cl_platform_info prop) const
6598 return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6599 sz < sizeof(buf) ? String(buf) : String();
6602 IMPLEMENT_REFCOUNTABLE();
6603 std::vector<cl_device_id> devices;
6604 cl_platform_id handle;
6611 PlatformInfo::PlatformInfo()
6616 PlatformInfo::PlatformInfo(void* platform_id)
6618 p = new Impl(platform_id);
6621 PlatformInfo::~PlatformInfo()
6627 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6634 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6647 int PlatformInfo::deviceNumber() const
6649 return p ? (int)p->devices.size() : 0;
6652 void PlatformInfo::getDevice(Device& device, int d) const
6654 CV_Assert(p && d < (int)p->devices.size() );
6656 device.set(p->devices[d]);
6659 String PlatformInfo::name() const
6661 return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6664 String PlatformInfo::vendor() const
6666 return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6669 String PlatformInfo::version() const
6671 return p ? p->version_ : String();
6674 int PlatformInfo::versionMajor() const
6677 return p->versionMajor_;
6680 int PlatformInfo::versionMinor() const
6683 return p->versionMinor_;
6686 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6688 cl_uint numPlatforms = 0;
6689 CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6691 if (numPlatforms == 0)
6697 platforms.resize((size_t)numPlatforms);
6698 CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6701 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6703 std::vector<cl_platform_id> platforms;
6704 getPlatforms(platforms);
6706 for (size_t i = 0; i < platforms.size(); i++)
6707 platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6710 const char* typeToStr(int type)
6712 static const char* tab[]=
6714 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6715 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6716 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6717 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6718 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6719 "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6720 "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6721 "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6722 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6724 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6725 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6730 const char* memopTypeToStr(int type)
6732 static const char* tab[] =
6734 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6735 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6736 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6737 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6738 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6739 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6740 "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6741 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6742 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6744 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6745 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6750 const char* vecopTypeToStr(int type)
6752 static const char* tab[] =
6754 "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6755 "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6756 "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6757 "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6758 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6759 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6760 "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6761 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6762 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6764 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6765 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6770 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6772 if( sdepth == ddepth )
6774 const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6775 if( ddepth >= CV_32F ||
6776 (ddepth == CV_32S && sdepth < CV_32S) ||
6777 (ddepth == CV_16S && sdepth <= CV_8S) ||
6778 (ddepth == CV_16U && sdepth == CV_8U))
6780 sprintf(buf, "convert_%s", typestr);
6782 else if( sdepth >= CV_32F )
6783 sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6785 sprintf(buf, "convert_%s_sat", typestr);
6790 const char* getOpenCLErrorString(int errorCode)
6792 #define CV_OCL_CODE(id) case id: return #id
6793 #define CV_OCL_CODE_(id, name) case id: return #name
6796 CV_OCL_CODE(CL_SUCCESS);
6797 CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6798 CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6799 CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6800 CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6801 CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6802 CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6803 CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6804 CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6805 CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6806 CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6807 CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6808 CV_OCL_CODE(CL_MAP_FAILURE);
6809 CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6810 CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6811 CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6812 CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6813 CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6814 CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6815 CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6816 CV_OCL_CODE(CL_INVALID_VALUE);
6817 CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6818 CV_OCL_CODE(CL_INVALID_PLATFORM);
6819 CV_OCL_CODE(CL_INVALID_DEVICE);
6820 CV_OCL_CODE(CL_INVALID_CONTEXT);
6821 CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6822 CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6823 CV_OCL_CODE(CL_INVALID_HOST_PTR);
6824 CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6825 CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6826 CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6827 CV_OCL_CODE(CL_INVALID_SAMPLER);
6828 CV_OCL_CODE(CL_INVALID_BINARY);
6829 CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6830 CV_OCL_CODE(CL_INVALID_PROGRAM);
6831 CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6832 CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6833 CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6834 CV_OCL_CODE(CL_INVALID_KERNEL);
6835 CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6836 CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6837 CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6838 CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6839 CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6840 CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6841 CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6842 CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6843 CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6844 CV_OCL_CODE(CL_INVALID_EVENT);
6845 CV_OCL_CODE(CL_INVALID_OPERATION);
6846 CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6847 CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6848 CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6849 CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6851 CV_OCL_CODE(CL_INVALID_PROPERTY);
6853 CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6854 CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6855 CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6856 CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6858 CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6859 CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6861 CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6862 CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6863 CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6864 CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6865 CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6866 CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6867 default: return "Unknown OpenCL error";
6873 template <typename T>
6874 static std::string kerToStr(const Mat & k)
6876 int width = k.cols - 1, depth = k.depth();
6877 const T * const data = k.ptr<T>();
6879 std::ostringstream stream;
6880 stream.precision(10);
6884 for (int i = 0; i < width; ++i)
6885 stream << "DIG(" << (int)data[i] << ")";
6886 stream << "DIG(" << (int)data[width] << ")";
6888 else if (depth == CV_32F)
6890 stream.setf(std::ios_base::showpoint);
6891 for (int i = 0; i < width; ++i)
6892 stream << "DIG(" << data[i] << "f)";
6893 stream << "DIG(" << data[width] << "f)";
6897 for (int i = 0; i < width; ++i)
6898 stream << "DIG(" << data[i] << ")";
6899 stream << "DIG(" << data[width] << ")";
6902 return stream.str();
6905 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6907 Mat kernel = _kernel.getMat().reshape(1, 1);
6909 int depth = kernel.depth();
6913 if (ddepth != depth)
6914 kernel.convertTo(kernel, ddepth);
6916 typedef std::string (* func_t)(const Mat &);
6917 static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6918 kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6919 const func_t func = funcs[ddepth];
6920 CV_Assert(func != 0);
6922 return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6925 #define PROCESS_SRC(src) \
6930 CV_Assert(src.isMat() || src.isUMat()); \
6931 Size csize = src.size(); \
6932 int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6933 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6934 if (cwidth < ckercn || ckercn <= 0) \
6936 cols.push_back(cwidth); \
6937 if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6939 offsets.push_back(src.offset()); \
6940 steps.push_back(src.step()); \
6941 dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6942 kercns.push_back(ckercn); \
6947 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6948 InputArray src4, InputArray src5, InputArray src6,
6949 InputArray src7, InputArray src8, InputArray src9,
6950 OclVectorStrategy strat)
6952 const ocl::Device & d = ocl::Device::getDefault();
6954 int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6955 d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6956 d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6957 d.preferredVectorWidthDouble(), -1 };
6959 // if the device says don't use vectors
6960 if (vectorWidths[0] == 1)
6963 vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6964 vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6965 vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6968 return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6971 int checkOptimalVectorWidth(const int *vectorWidths,
6972 InputArray src1, InputArray src2, InputArray src3,
6973 InputArray src4, InputArray src5, InputArray src6,
6974 InputArray src7, InputArray src8, InputArray src9,
6975 OclVectorStrategy strat)
6977 CV_Assert(vectorWidths);
6979 int ref_type = src1.type();
6981 std::vector<size_t> offsets, steps, cols;
6982 std::vector<int> dividers, kercns;
6993 size_t size = offsets.size();
6995 for (size_t i = 0; i < size; ++i)
6996 while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6997 dividers[i] >>= 1, kercns[i] >>= 1;
7000 int kercn = *std::min_element(kercns.begin(), kercns.end());
7005 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
7006 InputArray src4, InputArray src5, InputArray src6,
7007 InputArray src7, InputArray src8, InputArray src9)
7009 return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
7015 // TODO Make this as a method of OpenCL "BuildOptions" class
7016 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
7018 if (!buildOptions.empty())
7019 buildOptions += " ";
7020 int type = _m.type(), depth = CV_MAT_DEPTH(type);
7021 buildOptions += format(
7022 "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
7023 name.c_str(), ocl::typeToStr(type),
7024 name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
7025 name.c_str(), (int)CV_MAT_CN(type),
7026 name.c_str(), (int)CV_ELEM_SIZE(type),
7027 name.c_str(), (int)CV_ELEM_SIZE1(type),
7028 name.c_str(), (int)depth
7033 struct Image2D::Impl
7035 Impl(const UMat &src, bool norm, bool alias)
7039 init(src, norm, alias);
7045 clReleaseMemObject(handle);
7048 static cl_image_format getImageFormat(int depth, int cn, bool norm)
7050 cl_image_format format;
7051 static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7052 CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7053 static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7054 CL_SNORM_INT16, -1, -1, -1, -1 };
7055 static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7057 int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7058 int channelOrder = channelOrders[cn];
7059 format.image_channel_data_type = (cl_channel_type)channelType;
7060 format.image_channel_order = (cl_channel_order)channelOrder;
7064 static bool isFormatSupported(cl_image_format format)
7067 CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7069 cl_context context = (cl_context)Context::getDefault().ptr();
7073 // Figure out how many formats are supported by this context.
7074 cl_uint numFormats = 0;
7075 cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7076 CL_MEM_OBJECT_IMAGE2D, numFormats,
7078 CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7081 AutoBuffer<cl_image_format> formats(numFormats);
7082 err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7083 CL_MEM_OBJECT_IMAGE2D, numFormats,
7084 formats.data(), NULL);
7085 CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7086 for (cl_uint i = 0; i < numFormats; ++i)
7088 if (!memcmp(&formats[i], &format, sizeof(format)))
7097 void init(const UMat &src, bool norm, bool alias)
7100 CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7102 CV_Assert(!src.empty());
7103 CV_Assert(ocl::Device::getDefault().imageSupport());
7105 int err, depth = src.depth(), cn = src.channels();
7107 cl_image_format format = getImageFormat(depth, cn, norm);
7109 if (!isFormatSupported(format))
7110 CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7112 if (alias && !src.handle(ACCESS_RW))
7113 CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7115 cl_context context = (cl_context)Context::getDefault().ptr();
7116 cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7118 #ifdef CL_VERSION_1_2
7119 // this enables backwards portability to
7120 // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7121 const Device & d = ocl::Device::getDefault();
7122 int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7123 CV_Assert(!alias || canCreateAlias(src));
7124 if (1 < major || (1 == major && 2 <= minor))
7127 desc.image_type = CL_MEM_OBJECT_IMAGE2D;
7128 desc.image_width = src.cols;
7129 desc.image_height = src.rows;
7130 desc.image_depth = 0;
7131 desc.image_array_size = 1;
7132 desc.image_row_pitch = alias ? src.step[0] : 0;
7133 desc.image_slice_pitch = 0;
7134 desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7135 desc.num_mip_levels = 0;
7136 desc.num_samples = 0;
7137 handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7142 CV_SUPPRESS_DEPRECATED_START
7143 CV_Assert(!alias); // This is an OpenCL 1.2 extension
7144 handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7145 CV_SUPPRESS_DEPRECATED_END
7147 CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7149 size_t origin[] = { 0, 0, 0 };
7150 size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7153 if (!alias && !src.isContinuous())
7155 devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7156 CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7157 (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7160 const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7161 CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7162 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7163 CV_OCL_DBG_CHECK(clFlush(queue));
7167 devData = (cl_mem)src.handle(ACCESS_READ);
7169 CV_Assert(devData != NULL);
7173 CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7174 if (!src.isContinuous())
7176 CV_OCL_DBG_CHECK(clFlush(queue));
7177 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7182 IMPLEMENT_REFCOUNTABLE();
7192 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7194 p = new Impl(src, norm, alias);
7197 bool Image2D::canCreateAlias(const UMat &m)
7200 const Device & d = ocl::Device::getDefault();
7201 if (d.imageFromBufferSupport() && !m.empty())
7203 // This is the required pitch alignment in pixels
7204 uint pitchAlign = d.imagePitchAlignment();
7205 if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7207 // We don't currently handle the case where the buffer was created
7208 // with CL_MEM_USE_HOST_PTR
7209 if (!m.u->tempUMat())
7218 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7220 cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7222 return Impl::isFormatSupported(format);
7225 Image2D::Image2D(const Image2D & i)
7232 Image2D & Image2D::operator = (const Image2D & i)
7251 void* Image2D::ptr() const
7253 return p ? p->handle : 0;
7256 bool internal::isOpenCLForced()
7258 static bool initialized = false;
7259 static bool value = false;
7262 value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7268 bool internal::isPerformanceCheckBypassed()
7270 static bool initialized = false;
7271 static bool value = false;
7274 value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7280 bool internal::isCLBuffer(UMat& u)
7282 void* h = u.handle(ACCESS_RW);
7285 CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7287 if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7290 cl_mem_object_type type = 0;
7291 cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7292 if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7302 Impl(const Queue& q)
7311 CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7317 CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7321 uint64 durationNS() const
7323 return (uint64)(timer.getTimeSec() * 1e9);
7329 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7330 Timer::~Timer() { delete p; }
7344 uint64 Timer::durationNS() const
7347 return p->durationNS();
7353 namespace cv { namespace directx { namespace internal {
7354 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7356 ocl::Context::Impl* i = ctx.getImpl();
7358 return i->getDirectXImpl();
7360 }}} // namespace cv::directx::internal
7363 #endif // HAVE_OPENCL