1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
19 // * Redistribution's of source code must retain the above copyright notice,
20 // this list of conditions and the following disclaimer.
22 // * Redistribution's in binary form must reproduce the above copyright notice,
23 // this list of conditions and the following disclaimer in the documentation
24 // and/or other materials provided with the distribution.
26 // * The name of the copyright holders may not be used to endorse or promote products
27 // derived from this software without specific prior written permission.
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
42 #include "precomp.hpp"
45 #include "ocl_disabled.impl.hpp"
54 #include <iostream> // std::cerr
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
60 #include <opencv2/core/utils/configuration.private.hpp>
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
80 #define CV_OPENCL_SHOW_RUN_KERNELS 0
81 #define CV_OPENCL_TRACE_CHECK 0
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS 1
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG 1
86 #define CV_OPENCL_SHOW_SVM_LOG 0
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
91 # define LOG_BUFFER_POOL printf
93 # define LOG_BUFFER_POOL(...)
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
101 #define CV_OPENCL_SVM_TRACE_P(...)
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
117 #include "directx.hpp"
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
126 #include "umatrix.hpp"
128 namespace cv { namespace ocl {
130 #define IMPLEMENT_REFCOUNTABLE() \
131 void addref() { CV_XADD(&refcount, 1); } \
132 void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
140 return opencl_allocator_stats;
144 static bool isRaiseError()
146 static bool initialized = false;
147 static bool value = false;
150 value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
157 #if CV_OPENCL_TRACE_CHECK
159 void traceOpenCLCheck(cl_int status, const char* message)
161 std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169 cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
173 CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174 if (check_result != CL_SUCCESS) \
176 static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177 cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178 CV_Error(Error::OpenCLApiCallError, error_msg); \
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
193 CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194 if (check_result != CL_SUCCESS && isRaiseError()) \
196 static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197 cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198 CV_Error(Error::OpenCLApiCallError, error_msg); \
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
224 static const String getBuildExtraOptions()
226 static String param_buildExtraOptions;
227 static bool initialized = false;
230 param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
232 if (!param_buildExtraOptions.empty())
233 CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
235 return param_buildExtraOptions;
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
244 UMat2D(const UMat& m)
246 offset = (int)m.offset;
259 UMat3D(const UMat& m)
261 offset = (int)m.offset;
262 step = (int)m.step.p[1];
263 slicestep = (int)m.step.p[0];
264 slices = (int)m.size.p[0];
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
279 static uint64 table[256];
280 static bool initialized = false;
284 for( int i = 0; i < 256; i++ )
287 for( int j = 0; j < 8; j++ )
288 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
295 for( size_t idx = 0; idx < size; idx++ )
296 crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
304 cv::String cache_path_;
305 cv::String cache_lock_filename_;
306 cv::Ptr<utils::fs::FileLock> cache_lock_;
308 typedef std::map<std::string, std::string> ContextCacheType;
309 ContextCacheType prepared_contexts_;
310 Mutex mutex_prepared_contexts_;
312 OpenCLBinaryCacheConfigurator()
314 CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315 if (!CV_OPENCL_CACHE_ENABLE)
317 CV_LOG_INFO(NULL, "OpenCL cache is disabled");
320 cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321 if (cache_path_.empty())
323 CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
329 if (cache_path_.empty())
331 if (cache_path_ == "disabled")
333 if (!utils::fs::createDirectories(cache_path_))
335 CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
340 if (CV_OPENCL_CACHE_LOCK_ENABLE)
342 cache_lock_filename_ = cache_path_ + ".lock";
343 if (!utils::fs::exists(cache_lock_filename_))
345 CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346 std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347 if (!lock_filename.is_open())
349 CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
356 cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357 CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
359 utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
361 CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
363 catch (const cv::Exception& e)
365 CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
369 CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
374 if (CV_OPENCL_CACHE_WRITE)
376 CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377 "(not safe for multiprocess environment)");
381 CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
385 catch (const cv::Exception& e)
387 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
391 if (!cache_path_.empty())
393 if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
395 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396 "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
400 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
408 cache_lock_filename_.clear();
409 cache_lock_.release();
412 std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413 const std::string& cleanup_prefix)
415 if (cache_path_.empty())
416 return std::string();
418 AutoLock lock(mutex_prepared_contexts_);
420 ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421 if (found_it != prepared_contexts_.end())
422 return found_it->second;
424 CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
426 std::string target_directory = cache_path_ + ctx_prefix + "/";
427 bool result = utils::fs::isDirectory(target_directory);
432 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433 if (utils::fs::createDirectories(target_directory))
439 CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
442 catch (const cv::Exception& e)
444 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
447 target_directory = result ? target_directory : std::string();
448 prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
450 if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
454 std::vector<String> entries;
455 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456 std::vector<String> remove_entries;
457 for (size_t i = 0; i < entries.size(); i++)
459 const String& name = entries[i];
460 if (0 == name.find(cleanup_prefix))
462 if (0 == name.find(ctx_prefix))
463 continue; // skip current
464 remove_entries.push_back(name);
467 if (!remove_entries.empty())
469 CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470 ? "Detected OpenCL cache directory for other version of OpenCL device."
471 : "Detected OpenCL cache directories for other versions of OpenCL device.")
472 << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473 CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474 for (size_t i = 0; i < remove_entries.size(); i++)
476 CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
478 CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
480 for (size_t i = 0; i < remove_entries.size(); i++)
482 const String& name = remove_entries[i];
483 cv::String path = utils::fs::join(cache_path_, name);
486 utils::fs::remove_all(path);
487 CV_LOG_WARNING(NULL, "Removed: " << path);
489 catch (const cv::Exception& e)
491 CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
498 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
502 CV_LOG_VERBOSE(NULL, 1, " Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503 return target_directory;
506 static OpenCLBinaryCacheConfigurator& getSingletonInstance()
508 CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
511 class BinaryProgramFile
513 enum { MAX_ENTRIES = 64 };
515 typedef unsigned int uint32_t;
517 struct CV_DECL_ALIGNED(4) FileHeader
519 uint32_t sourceSignatureSize;
520 //char sourceSignature[];
523 struct CV_DECL_ALIGNED(4) FileTable
525 uint32_t numberOfEntries;
526 //uint32_t firstEntryOffset[];
529 struct CV_DECL_ALIGNED(4) FileEntry
531 uint32_t nextEntryFileOffset; // 0 for the last entry in chain
538 const std::string fileName_;
539 const char* const sourceSignature_;
540 const size_t sourceSignatureSize_;
544 uint32_t entryOffsets[MAX_ENTRIES];
546 uint32_t getHash(const std::string& options)
548 uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549 return hash & (MAX_ENTRIES - 1);
552 inline size_t getFileSize()
554 size_t pos = (size_t)f.tellg();
555 f.seekg(0, std::fstream::end);
556 size_t fileSize = (size_t)f.tellg();
557 f.seekg(pos, std::fstream::beg);
560 inline uint32_t readUInt32()
563 f.read((char*)&res, sizeof(uint32_t));
564 CV_Assert(!f.fail());
567 inline void writeUInt32(const uint32_t value)
570 f.write((char*)&v, sizeof(uint32_t));
571 CV_Assert(!f.fail());
574 inline void seekReadAbsolute(size_t pos)
576 f.seekg(pos, std::fstream::beg);
577 CV_Assert(!f.fail());
579 inline void seekReadRelative(size_t pos)
581 f.seekg(pos, std::fstream::cur);
582 CV_Assert(!f.fail());
585 inline void seekWriteAbsolute(size_t pos)
587 f.seekp(pos, std::fstream::beg);
588 CV_Assert(!f.fail());
594 if (0 != remove(fileName_.c_str()))
595 CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
600 BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601 : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
603 CV_StaticAssert(sizeof(uint32_t) == 4, "");
604 CV_Assert(sourceSignature_ != NULL);
605 CV_Assert(sourceSignatureSize_ > 0);
606 memset(entryOffsets, 0, sizeof(entryOffsets));
608 f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610 if(f.is_open() && getFileSize() > 0)
612 bool isValid = false;
615 uint32_t fileSourceSignatureSize = readUInt32();
616 if (fileSourceSignatureSize == sourceSignatureSize_)
618 cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619 f.read(fileSourceSignature.data(), fileSourceSignatureSize);
622 CV_LOG_ERROR(NULL, "Unexpected EOF");
624 else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
631 CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
634 catch (const cv::Exception& e)
636 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
640 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
653 bool read(const std::string& key, std::vector<char>& buf)
658 size_t fileSize = getFileSize();
661 CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
668 uint32_t fileSourceSignatureSize = readUInt32();
669 CV_Assert(fileSourceSignatureSize > 0);
670 seekReadRelative(fileSourceSignatureSize);
672 uint32_t numberOfEntries = readUInt32();
673 CV_Assert(numberOfEntries > 0);
674 if (numberOfEntries != MAX_ENTRIES)
676 CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
680 f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681 CV_Assert(!f.fail());
683 uint32_t entryNum = getHash(key);
685 uint32_t entryOffset = entryOffsets[entryNum];
687 while (entryOffset > 0)
689 seekReadAbsolute(entryOffset);
690 //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691 f.read((char*)&entry, sizeof(entry));
692 CV_Assert(!f.fail());
693 cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694 if (key.size() == entry.keySize)
696 if (entry.keySize > 0)
698 f.read(fileKey.data(), entry.keySize);
699 CV_Assert(!f.fail());
701 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
703 buf.resize(entry.dataSize);
704 f.read(&buf[0], entry.dataSize);
705 CV_Assert(!f.fail());
707 CV_LOG_VERBOSE(NULL, 0, "Read...");
711 if (entry.nextEntryFileOffset == 0)
713 entryOffset = entry.nextEntryFileOffset;
718 bool write(const std::string& key, std::vector<char>& buf)
722 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
725 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
728 CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
734 size_t fileSize = getFileSize();
738 seekWriteAbsolute(0);
739 writeUInt32((uint32_t)sourceSignatureSize_);
740 f.write(sourceSignature_, sourceSignatureSize_);
741 CV_Assert(!f.fail());
743 writeUInt32(MAX_ENTRIES);
744 memset(entryOffsets, 0, sizeof(entryOffsets));
745 f.write((char*)entryOffsets, sizeof(entryOffsets));
746 CV_Assert(!f.fail());
748 CV_Assert(!f.fail());
750 f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751 CV_Assert(f.is_open());
752 fileSize = getFileSize();
757 uint32_t fileSourceSignatureSize = readUInt32();
758 CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759 seekReadRelative(fileSourceSignatureSize);
761 uint32_t numberOfEntries = readUInt32();
762 CV_Assert(numberOfEntries > 0);
763 if (numberOfEntries != MAX_ENTRIES)
765 CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
769 size_t tableEntriesOffset = (size_t)f.tellg();
770 f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771 CV_Assert(!f.fail());
773 uint32_t entryNum = getHash(key);
775 uint32_t entryOffset = entryOffsets[entryNum];
777 while (entryOffset > 0)
779 seekReadAbsolute(entryOffset);
780 //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781 f.read((char*)&entry, sizeof(entry));
782 CV_Assert(!f.fail());
783 cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784 if (key.size() == entry.keySize)
786 if (entry.keySize > 0)
788 f.read(fileKey.data(), entry.keySize);
789 CV_Assert(!f.fail());
791 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
794 CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
798 if (entry.nextEntryFileOffset == 0)
800 entryOffset = entry.nextEntryFileOffset;
805 seekWriteAbsolute(entryOffset);
806 entry.nextEntryFileOffset = (uint32_t)fileSize;
807 f.write((char*)&entry, sizeof(entry));
808 CV_Assert(!f.fail());
812 entryOffsets[entryNum] = (uint32_t)fileSize;
813 seekWriteAbsolute(tableEntriesOffset);
814 f.write((char*)entryOffsets, sizeof(entryOffsets));
815 CV_Assert(!f.fail());
817 seekWriteAbsolute(fileSize);
818 entry.nextEntryFileOffset = 0;
819 entry.dataSize = (uint32_t)buf.size();
820 entry.keySize = (uint32_t)key.size();
821 f.write((char*)&entry, sizeof(entry));
822 CV_Assert(!f.fail());
823 f.write(key.c_str(), entry.keySize);
824 CV_Assert(!f.fail());
825 f.write(&buf[0], entry.dataSize);
826 CV_Assert(!f.fail());
828 CV_Assert(!f.fail());
829 CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
837 struct OpenCLExecutionContext::Impl
839 ocl::Context context_;
840 int device_; // device index in context
847 void _init_device(cl_device_id deviceID)
850 int ndevices = (int)context_.ndevices();
851 CV_Assert(ndevices > 0);
853 for (int i = 0; i < ndevices; i++)
855 ocl::Device d = context_.device(i);
856 cl_device_id dhandle = (cl_device_id)d.ptr();
857 if (dhandle == deviceID)
864 CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
867 void _init_device(const ocl::Device& device)
869 CV_Assert(device.ptr());
870 int ndevices = (int)context_.ndevices();
871 CV_Assert(ndevices > 0);
873 for (int i = 0; i < ndevices; i++)
875 ocl::Device d = context_.device(i);
876 if (d.getImpl() == device.getImpl())
883 CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
887 Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888 : device_(0), useOpenCL_(-1)
890 CV_UNUSED(platformID);
894 context_ = Context::fromHandle(context);
895 _init_device(deviceID);
896 queue_ = Queue(context_, context_.device(device_));
899 Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900 : device_(0), useOpenCL_(-1)
902 CV_Assert(context.ptr());
903 CV_Assert(device.ptr());
906 _init_device(device);
910 Impl(const ocl::Context& context, const ocl::Device& device)
911 : device_(0), useOpenCL_(-1)
913 CV_Assert(context.ptr());
914 CV_Assert(device.ptr());
917 _init_device(device);
918 queue_ = Queue(context_, context_.device(device_));
921 Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
929 Impl(const Impl& other)
930 : context_(other.context_)
931 , device_(other.device_)
932 , queue_(other.queue_)
938 inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
946 if (!context_.empty() && context_.ndevices() > 0)
948 const Device& d = context_.device(device_);
949 useOpenCL_ = d.available();
952 catch (const cv::Exception&)
957 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
959 return useOpenCL_ > 0;
962 void setUseOpenCL(bool flag)
970 static const std::shared_ptr<Impl>& getInitializedExecutionContext()
974 CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
976 static bool initialized = false;
977 static std::shared_ptr<Impl> g_primaryExecutionContext;
981 cv::AutoLock lock(getInitializationMutex());
984 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
987 Context c = ocl::Context::create(std::string());
991 auto& d = c.device(deviceId);
994 auto q = ocl::Queue(c, d);
997 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
1001 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1007 CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1012 CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1015 catch (const std::exception& e)
1017 CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1021 CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1026 return g_primaryExecutionContext;
1030 Context& OpenCLExecutionContext::getContext() const
1035 Device& OpenCLExecutionContext::getDevice() const
1038 return p->context_.device(p->device_);
1040 Queue& OpenCLExecutionContext::getQueue() const
1046 bool OpenCLExecutionContext::useOpenCL() const
1049 return p->useOpenCL();
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1055 p->setUseOpenCL(flag);
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1061 CV_TRACE_FUNCTION();
1062 CoreTLSData& data = getCoreTlsData();
1063 OpenCLExecutionContext& c = data.oclExecutionContext;
1064 if (!data.oclExecutionContextInitialized)
1066 data.oclExecutionContextInitialized = true;
1067 if (c.empty() && haveOpenCL())
1068 c.p = Impl::getInitializedExecutionContext();
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1076 CV_TRACE_FUNCTION();
1077 CoreTLSData& data = getCoreTlsData();
1078 OpenCLExecutionContext& c = data.oclExecutionContext;
1082 void OpenCLExecutionContext::bind() const
1084 CV_TRACE_FUNCTION();
1086 CoreTLSData& data = getCoreTlsData();
1087 data.oclExecutionContext = *this;
1088 data.oclExecutionContextInitialized = true;
1089 data.useOpenCL = p->useOpenCL_; // propagate "-1", avoid call useOpenCL()
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1095 CV_TRACE_FUNCTION();
1097 const Queue q(getContext(), getDevice());
1098 return cloneWithNewQueue(q);
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1103 CV_TRACE_FUNCTION();
1105 CV_Assert(q.ptr() != NULL);
1106 OpenCLExecutionContext c;
1107 c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1114 CV_TRACE_FUNCTION();
1116 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1118 CV_Assert(!context.empty());
1119 CV_Assert(context.ptr());
1120 CV_Assert(!device.empty());
1121 CV_Assert(device.ptr());
1122 OpenCLExecutionContext ctx;
1123 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1131 CV_TRACE_FUNCTION();
1133 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1135 CV_Assert(!context.empty());
1136 CV_Assert(context.ptr());
1137 CV_Assert(!device.empty());
1138 CV_Assert(device.ptr());
1139 OpenCLExecutionContext ctx;
1140 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1145 void OpenCLExecutionContext::release()
1147 CV_TRACE_FUNCTION();
1152 // true if we have initialized OpenCL subsystem with available platforms
1153 static bool g_isOpenCVActivated = false;
1157 CV_TRACE_FUNCTION();
1158 static bool g_isOpenCLInitialized = false;
1159 static bool g_isOpenCLAvailable = false;
1161 if (!g_isOpenCLInitialized)
1163 CV_TRACE_REGION("Init_OpenCL_Runtime");
1164 const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1167 if (cv::String(envPath) == "disabled")
1169 g_isOpenCLAvailable = false;
1170 g_isOpenCLInitialized = true;
1175 cv::AutoLock lock(getInitializationMutex());
1176 CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1180 g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181 g_isOpenCVActivated = n > 0;
1182 CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1186 g_isOpenCLAvailable = false;
1188 g_isOpenCLInitialized = true;
1190 return g_isOpenCLAvailable;
1195 CoreTLSData& data = getCoreTlsData();
1196 if (data.useOpenCL < 0)
1203 auto c = OpenCLExecutionContext::getCurrent();
1204 data.useOpenCL = c.useOpenCL();
1209 CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1212 return data.useOpenCL > 0;
1215 bool isOpenCLActivated()
1217 if (!g_isOpenCVActivated)
1218 return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1222 void setUseOpenCL(bool flag)
1224 CV_TRACE_FUNCTION();
1226 CoreTLSData& data = getCoreTlsData();
1227 auto& c = OpenCLExecutionContext::getCurrentRef();
1230 c.setUseOpenCL(flag);
1231 data.useOpenCL = c.useOpenCL();
1238 data.useOpenCL = -1; // enabled by default (if context is not initialized)
1244 #ifdef HAVE_CLAMDBLAS
1249 static AmdBlasHelper & getInstance()
1251 CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1254 bool isAvailable() const
1256 return g_isAmdBlasAvailable;
1263 clAmdBlasTeardown();
1271 if (!g_isAmdBlasInitialized)
1273 AutoLock lock(getInitializationMutex());
1275 if (!g_isAmdBlasInitialized)
1281 g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1285 g_isAmdBlasAvailable = false;
1289 g_isAmdBlasAvailable = false;
1291 g_isAmdBlasInitialized = true;
1297 static bool g_isAmdBlasInitialized;
1298 static bool g_isAmdBlasAvailable;
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1306 return AmdBlasHelper::getInstance().isAvailable();
1318 #ifdef HAVE_CLAMDFFT
1323 static AmdFftHelper & getInstance()
1325 CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1328 bool isAvailable() const
1330 return g_isAmdFftAvailable;
1337 // clAmdFftTeardown();
1345 if (!g_isAmdFftInitialized)
1347 AutoLock lock(getInitializationMutex());
1349 if (!g_isAmdFftInitialized)
1355 cl_uint major, minor, patch;
1356 CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1358 // it throws exception in case AmdFft binaries are not found
1359 CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360 g_isAmdFftAvailable = true;
1362 catch (const Exception &)
1364 g_isAmdFftAvailable = false;
1368 g_isAmdFftAvailable = false;
1370 g_isAmdFftInitialized = true;
1376 static clAmdFftSetupData setupData;
1377 static bool g_isAmdFftInitialized;
1378 static bool g_isAmdFftAvailable;
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1387 return AmdFftHelper::getInstance().isAvailable();
1401 #ifdef HAVE_OPENCL_SVM
1410 Queue::getDefault().finish();
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1415 struct Platform::Impl
1421 initialized = false;
1430 //cl_uint num_entries
1432 if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1438 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1440 vendor = String(buf);
1447 IMPLEMENT_REFCOUNTABLE();
1449 cl_platform_id handle;
1454 Platform::Platform()
1459 Platform::~Platform()
1465 Platform::Platform(const Platform& pl)
1472 Platform& Platform::operator = (const Platform& pl)
1474 Impl* newp = (Impl*)pl.p;
1483 void* Platform::ptr() const
1485 return p ? p->handle : 0;
1488 Platform& Platform::getDefault()
1490 CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1502 // deviceVersion has format
1503 // OpenCL<space><major_version.minor_version><space><vendor-specific information>
1505 // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1510 if (10 >= deviceVersion.length())
1512 const char *pstr = deviceVersion.c_str();
1513 if (0 != strncmp(pstr, "OpenCL ", 7))
1515 size_t ppos = deviceVersion.find('.', 7);
1516 if (String::npos == ppos)
1518 String temp = deviceVersion.substr(7, ppos - 7);
1519 major = atoi(temp.c_str());
1520 temp = deviceVersion.substr(ppos + 1);
1521 minor = atoi(temp.c_str());
1532 cl_device_id device = (cl_device_id)d;
1534 CV_OCL_CHECK(clRetainDevice(device)); // increment reference counter on success only
1542 void _init(cl_device_id d)
1544 handle = (cl_device_id)d;
1546 name_ = getStrProp(CL_DEVICE_NAME);
1547 version_ = getStrProp(CL_DEVICE_VERSION);
1548 extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1549 doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1550 hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1551 maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1552 maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1553 type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1554 driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1555 addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1557 String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1558 parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1561 while (pos < extensions_.size())
1563 size_t pos2 = extensions_.find(' ', pos);
1564 if (pos2 == String::npos)
1565 pos2 = extensions_.size();
1568 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1569 extensions_set_.insert(extensionName);
1574 intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1576 vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1577 if (vendorName_ == "Advanced Micro Devices, Inc." ||
1578 vendorName_ == "AMD")
1579 vendorID_ = VENDOR_AMD;
1580 else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1581 vendorID_ = VENDOR_INTEL;
1582 else if (vendorName_ == "NVIDIA Corporation")
1583 vendorID_ = VENDOR_NVIDIA;
1585 vendorID_ = UNKNOWN_VENDOR;
1587 const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1588 if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1590 const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1591 if (new_maxWorkGroupSize != maxWorkGroupSize_)
1592 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1593 maxWorkGroupSize_ = new_maxWorkGroupSize;
1596 if (isExtensionSupported("cl_khr_spir"))
1598 #ifndef CL_DEVICE_SPIR_VERSIONS
1599 #define CL_DEVICE_SPIR_VERSIONS 0x40E0
1601 cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1602 std::cout << spir_versions << std::endl;
1610 if (!cv::__termination)
1615 CV_OCL_CHECK(clReleaseDevice(handle));
1621 template<typename _TpCL, typename _TpOut>
1622 _TpOut getProp(cl_device_info prop) const
1627 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1628 sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1631 bool getBoolProp(cl_device_info prop) const
1633 cl_bool temp = CL_FALSE;
1636 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1637 sz == sizeof(temp) ? temp != 0 : false;
1640 String getStrProp(cl_device_info prop) const
1644 return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1645 sz < sizeof(buf) ? String(buf) : String();
1648 bool isExtensionSupported(const std::string& extensionName) const
1650 return extensions_set_.count(extensionName) > 0;
1654 IMPLEMENT_REFCOUNTABLE();
1656 cl_device_id handle;
1660 std::string extensions_;
1661 int doubleFPConfig_;
1662 bool hostUnifiedMemory_;
1663 int maxComputeUnits_;
1664 size_t maxWorkGroupSize_;
1667 int deviceVersionMajor_;
1668 int deviceVersionMinor_;
1669 String driverVersion_;
1672 bool intelSubgroupsSupport_;
1674 std::set<std::string> extensions_set_;
1683 Device::Device(void* d)
1689 Device::Device(const Device& d)
1696 Device& Device::operator = (const Device& d)
1698 Impl* newp = (Impl*)d.p;
1713 void Device::set(void* d)
1720 CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1724 Device Device::fromHandle(void* d)
1730 void* Device::ptr() const
1732 return p ? p->handle : 0;
1735 String Device::name() const
1736 { return p ? p->name_ : String(); }
1738 String Device::extensions() const
1739 { return p ? String(p->extensions_) : String(); }
1741 bool Device::isExtensionSupported(const String& extensionName) const
1742 { return p ? p->isExtensionSupported(extensionName) : false; }
1744 String Device::version() const
1745 { return p ? p->version_ : String(); }
1747 String Device::vendorName() const
1748 { return p ? p->vendorName_ : String(); }
1750 int Device::vendorID() const
1751 { return p ? p->vendorID_ : 0; }
1753 String Device::OpenCL_C_Version() const
1754 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1756 String Device::OpenCLVersion() const
1757 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1759 int Device::deviceVersionMajor() const
1760 { return p ? p->deviceVersionMajor_ : 0; }
1762 int Device::deviceVersionMinor() const
1763 { return p ? p->deviceVersionMinor_ : 0; }
1765 String Device::driverVersion() const
1766 { return p ? p->driverVersion_ : String(); }
1768 int Device::type() const
1769 { return p ? p->type_ : 0; }
1771 int Device::addressBits() const
1772 { return p ? p->addressBits_ : 0; }
1774 bool Device::available() const
1775 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1777 bool Device::compilerAvailable() const
1778 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1780 bool Device::linkerAvailable() const
1781 #ifdef CL_VERSION_1_2
1782 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1784 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1787 int Device::doubleFPConfig() const
1788 { return p ? p->doubleFPConfig_ : 0; }
1790 int Device::singleFPConfig() const
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1793 int Device::halfFPConfig() const
1794 #ifdef CL_VERSION_1_2
1795 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1797 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1800 bool Device::endianLittle() const
1801 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1803 bool Device::errorCorrectionSupport() const
1804 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1806 int Device::executionCapabilities() const
1807 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1809 size_t Device::globalMemCacheSize() const
1810 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1812 int Device::globalMemCacheType() const
1813 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1815 int Device::globalMemCacheLineSize() const
1816 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1818 size_t Device::globalMemSize() const
1819 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1821 size_t Device::localMemSize() const
1822 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1824 int Device::localMemType() const
1825 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1827 bool Device::hostUnifiedMemory() const
1828 { return p ? p->hostUnifiedMemory_ : false; }
1830 bool Device::imageSupport() const
1831 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1833 bool Device::imageFromBufferSupport() const
1835 return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1838 uint Device::imagePitchAlignment() const
1840 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1841 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1847 uint Device::imageBaseAddressAlignment() const
1849 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1850 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1856 size_t Device::image2DMaxWidth() const
1857 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1859 size_t Device::image2DMaxHeight() const
1860 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1862 size_t Device::image3DMaxWidth() const
1863 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1865 size_t Device::image3DMaxHeight() const
1866 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1868 size_t Device::image3DMaxDepth() const
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1871 size_t Device::imageMaxBufferSize() const
1872 #ifdef CL_VERSION_1_2
1873 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1875 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1878 size_t Device::imageMaxArraySize() const
1879 #ifdef CL_VERSION_1_2
1880 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1882 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1885 bool Device::intelSubgroupsSupport() const
1886 { return p ? p->intelSubgroupsSupport_ : false; }
1888 int Device::maxClockFrequency() const
1889 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1891 int Device::maxComputeUnits() const
1892 { return p ? p->maxComputeUnits_ : 0; }
1894 int Device::maxConstantArgs() const
1895 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1897 size_t Device::maxConstantBufferSize() const
1898 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1900 size_t Device::maxMemAllocSize() const
1901 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1903 size_t Device::maxParameterSize() const
1904 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1906 int Device::maxReadImageArgs() const
1907 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1909 int Device::maxWriteImageArgs() const
1910 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1912 int Device::maxSamplers() const
1913 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1915 size_t Device::maxWorkGroupSize() const
1916 { return p ? p->maxWorkGroupSize_ : 0; }
1918 int Device::maxWorkItemDims() const
1919 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1921 void Device::maxWorkItemSizes(size_t* sizes) const
1925 const int MAX_DIMS = 32;
1927 CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1928 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1932 int Device::memBaseAddrAlign() const
1933 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1935 int Device::nativeVectorWidthChar() const
1936 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1938 int Device::nativeVectorWidthShort() const
1939 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1941 int Device::nativeVectorWidthInt() const
1942 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1944 int Device::nativeVectorWidthLong() const
1945 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1947 int Device::nativeVectorWidthFloat() const
1948 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1950 int Device::nativeVectorWidthDouble() const
1951 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1953 int Device::nativeVectorWidthHalf() const
1954 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1956 int Device::preferredVectorWidthChar() const
1957 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1959 int Device::preferredVectorWidthShort() const
1960 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1962 int Device::preferredVectorWidthInt() const
1963 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1965 int Device::preferredVectorWidthLong() const
1966 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1968 int Device::preferredVectorWidthFloat() const
1969 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1971 int Device::preferredVectorWidthDouble() const
1972 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1974 int Device::preferredVectorWidthHalf() const
1975 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1977 size_t Device::printfBufferSize() const
1978 #ifdef CL_VERSION_1_2
1979 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1981 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1985 size_t Device::profilingTimerResolution() const
1986 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1988 const Device& Device::getDefault()
1990 auto& c = OpenCLExecutionContext::getCurrent();
1993 return c.getDevice();
1996 static Device dummy;
2000 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2002 template <typename Functor, typename ObjectType>
2003 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2006 cl_int err = f(obj, name, 0, NULL, &required);
2007 if (err != CL_SUCCESS)
2013 AutoBuffer<char> buf(required + 1);
2014 char* ptr = buf.data(); // cleanup is not needed
2015 err = f(obj, name, required, ptr, NULL);
2016 if (err != CL_SUCCESS)
2024 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2029 std::istringstream ss(s);
2033 std::getline(ss, item, delim);
2034 elems.push_back(item);
2038 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2040 // Sample: AMD:GPU:Tahiti
2041 // Sample: :GPU|CPU: = '' = ':' = '::'
2042 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2043 std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2045 std::vector<std::string> parts;
2046 split(configurationStr, ':', parts);
2047 if (parts.size() > 3)
2049 CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2052 if (parts.size() > 2)
2053 deviceNameOrID = parts[2];
2054 if (parts.size() > 1)
2056 split(parts[1], '|', deviceTypes);
2058 if (parts.size() > 0)
2060 platform = parts[0];
2065 #if defined WINRT || defined _WIN32_WCE
2066 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2068 CV_UNUSED(configuration)
2072 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2074 std::string platform, deviceName;
2075 std::vector<std::string> deviceTypes;
2078 configuration = getenv("OPENCV_OPENCL_DEVICE");
2080 if (configuration &&
2081 (strcmp(configuration, "disabled") == 0 ||
2082 !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2088 if (deviceName.length() == 1)
2089 // We limit ID range to 0..9, because we want to write:
2090 // - '2500' to mean i5-2500
2091 // - '8350' to mean AMD FX-8350
2092 // - '650' to mean GeForce 650
2093 // To extend ID range change condition to '> 0'
2096 for (size_t i = 0; i < deviceName.length(); i++)
2098 if (!isdigit(deviceName[i]))
2106 deviceID = atoi(deviceName.c_str());
2112 std::vector<cl_platform_id> platforms;
2114 cl_uint numPlatforms = 0;
2115 CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2117 if (numPlatforms == 0)
2119 platforms.resize((size_t)numPlatforms);
2120 CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2121 platforms.resize(numPlatforms);
2124 int selectedPlatform = -1;
2125 if (platform.length() > 0)
2127 for (size_t i = 0; i < platforms.size(); i++)
2130 CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2131 if (name.find(platform) != std::string::npos)
2133 selectedPlatform = (int)i;
2137 if (selectedPlatform == -1)
2139 CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2143 if (deviceTypes.size() == 0)
2147 deviceTypes.push_back("GPU");
2149 deviceTypes.push_back("CPU");
2152 deviceTypes.push_back("ALL");
2154 for (size_t t = 0; t < deviceTypes.size(); t++)
2157 std::string tempStrDeviceType = deviceTypes[t];
2158 std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2160 if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2161 deviceType = Device::TYPE_GPU;
2162 else if (tempStrDeviceType == "cpu")
2163 deviceType = Device::TYPE_CPU;
2164 else if (tempStrDeviceType == "accelerator")
2165 deviceType = Device::TYPE_ACCELERATOR;
2166 else if (tempStrDeviceType == "all")
2167 deviceType = Device::TYPE_ALL;
2170 CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2174 std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2175 for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2176 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2180 cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2181 if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2183 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2187 size_t base = devices.size();
2188 devices.resize(base + count);
2189 status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2190 if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2192 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2196 for (size_t i = (isID ? deviceID : 0);
2197 (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2201 CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2202 cl_bool useGPU = true;
2203 if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2205 cl_bool isIGPU = CL_FALSE;
2206 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2207 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2209 if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2211 // TODO check for OpenCL 1.1
2219 return NULL; // suppress messages on stderr
2221 std::ostringstream msg;
2222 msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2223 << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2224 << " Device types:";
2225 for (size_t t = 0; t < deviceTypes.size(); t++)
2226 msg << ' ' << deviceTypes[t];
2228 msg << std::endl << " Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2230 CV_LOG_ERROR(NULL, msg.str());
2235 #ifdef HAVE_OPENCL_SVM
2238 enum AllocatorFlags { // don't use first 16 bits
2239 OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2240 OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2241 OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2242 OPENCL_SVM_BUFFER_MASK = 3 << 16,
2243 OPENCL_SVM_BUFFER_MAP = 4 << 16
2246 static bool checkForceSVMUmatUsage()
2248 static bool initialized = false;
2249 static bool force = false;
2252 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2257 static bool checkDisableSVMUMatUsage()
2259 static bool initialized = false;
2260 static bool force = false;
2263 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2268 static bool checkDisableSVM()
2270 static bool initialized = false;
2271 static bool force = false;
2274 force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2279 // see SVMCapabilities
2280 static unsigned int getSVMCapabilitiesMask()
2282 static bool initialized = false;
2283 static unsigned int mask = 0;
2286 const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2287 if (envValue == NULL)
2289 return ~0U; // all bits 1
2291 mask = atoi(envValue);
2299 static size_t getProgramCountLimit()
2301 static bool initialized = false;
2302 static size_t count = 0;
2305 count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2311 static int g_contextId = 0;
2313 class OpenCLBufferPoolImpl;
2314 class OpenCLSVMBufferPoolImpl;
2316 struct Context::Impl
2318 static Context::Impl* get(Context& context) { return context.p; }
2320 typedef std::deque<Context::Impl*> container_t;
2321 static container_t& getGlobalContainer()
2323 // never delete this container (Impl lifetime is greater due to TLS storage)
2324 static container_t* g_contexts = new container_t();
2329 Impl(const std::string& configuration_)
2331 , contextId(CV_XADD(&g_contextId, 1))
2332 , configuration(configuration_)
2337 #ifdef HAVE_OPENCL_SVM
2338 , svmInitialized(false)
2342 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2344 cv::AutoLock lock(cv::getInitializationMutex());
2345 auto& container = getGlobalContainer();
2346 container.resize(std::max(container.size(), (size_t)contextId + 1));
2347 container[contextId] = this;
2353 if (!cv::__termination)
2358 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2363 directx::internal::deleteDirectXImpl(&p_directx_impl);
2368 cv::AutoLock lock(cv::getInitializationMutex());
2369 auto& container = getGlobalContainer();
2370 CV_CheckLT((size_t)contextId, container.size(), "");
2371 container[contextId] = NULL;
2375 void init_device_list()
2379 cl_uint ndevices = 0;
2380 CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2381 CV_Assert(ndevices > 0);
2383 cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2384 size_t devices_ret_size = 0;
2385 CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2386 CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2389 for (unsigned i = 0; i < ndevices; i++)
2391 devices.emplace_back(Device::fromHandle(cl_devices[i]));
2395 void __init_buffer_pools(); // w/o synchronization
2396 void _init_buffer_pools() const
2400 cv::AutoLock lock(cv::getInitializationMutex());
2403 const_cast<Impl*>(this)->__init_buffer_pools();
2408 static Impl* findContext(const std::string& configuration)
2410 CV_TRACE_FUNCTION();
2411 cv::AutoLock lock(cv::getInitializationMutex());
2412 auto& container = getGlobalContainer();
2413 if (configuration.empty() && !container.empty())
2414 return container[0];
2415 for (auto it = container.begin(); it != container.end(); ++it)
2418 if (i && i->configuration == configuration)
2426 static Impl* findOrCreateContext(const std::string& configuration_)
2428 CV_TRACE_FUNCTION();
2429 std::string configuration = configuration_;
2430 if (configuration_.empty())
2432 const char* c = getenv("OPENCV_OPENCL_DEVICE");
2436 Impl* impl = findContext(configuration);
2439 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2443 cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2447 impl = new Impl(configuration);
2450 impl->createFromDevice(d);
2463 static Impl* findOrCreateContext(cl_context h)
2465 CV_TRACE_FUNCTION();
2469 std::string configuration = cv::format("@ctx-%p", (void*)h);
2470 Impl* impl = findContext(configuration);
2473 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2478 impl = new Impl(configuration);
2481 CV_OCL_CHECK(clRetainContext(h));
2483 impl->init_device_list();
2493 static Impl* findOrCreateContext(const ocl::Device& device)
2495 CV_TRACE_FUNCTION();
2497 CV_Assert(!device.empty());
2498 cl_device_id d = (cl_device_id)device.ptr();
2501 std::string configuration = cv::format("@dev-%p", (void*)d);
2502 Impl* impl = findContext(configuration);
2505 CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2510 impl = new Impl(configuration);
2513 impl->createFromDevice(d);
2514 CV_Assert(impl->handle);
2526 CV_TRACE_FUNCTION();
2527 cl_device_id d = selectOpenCLDevice();
2532 createFromDevice(d);
2535 void createFromDevice(cl_device_id d)
2537 CV_TRACE_FUNCTION();
2538 CV_Assert(handle == NULL);
2540 cl_platform_id pl = NULL;
2541 CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2543 cl_context_properties prop[] =
2545 CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2549 // !!! in the current implementation force the number of devices to 1 !!!
2553 handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2554 CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2556 bool ok = handle != 0 && status == CL_SUCCESS;
2566 Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2568 void unloadProg(Program& prog)
2570 cv::AutoLock lock(program_cache_mutex);
2571 for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2573 phash_t::iterator it = phash.find(*i);
2574 if (it != phash.end())
2576 if (it->second.ptr() == prog.ptr())
2586 std::string& getPrefixString()
2590 cv::AutoLock lock(program_cache_mutex);
2593 CV_Assert(!devices.empty());
2594 const Device& d = devices[0];
2595 int bits = d.addressBits();
2596 if (bits > 0 && bits != 64)
2597 prefix = cv::format("%d-bit--", bits);
2598 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2600 for (size_t i = 0; i < prefix.size(); i++)
2603 if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2613 std::string& getPrefixBase()
2615 if (prefix_base.empty())
2617 cv::AutoLock lock(program_cache_mutex);
2618 if (prefix_base.empty())
2620 const Device& d = devices[0];
2621 int bits = d.addressBits();
2622 if (bits > 0 && bits != 64)
2623 prefix_base = cv::format("%d-bit--", bits);
2624 prefix_base += d.vendorName() + "--" + d.name() + "--";
2626 for (size_t i = 0; i < prefix_base.size(); i++)
2628 char c = prefix_base[i];
2629 if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2631 prefix_base[i] = '_';
2639 IMPLEMENT_REFCOUNTABLE();
2641 const int contextId; // global unique ID
2642 const std::string configuration;
2645 std::vector<Device> devices;
2648 std::string prefix_base;
2650 cv::Mutex program_cache_mutex;
2651 typedef std::map<std::string, Program> phash_t;
2653 typedef std::list<cv::String> CacheList;
2654 CacheList cacheList;
2656 std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2657 std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2658 OpenCLBufferPoolImpl& getBufferPool() const
2660 _init_buffer_pools();
2661 CV_DbgAssert(bufferPool_);
2662 return *bufferPool_.get();
2664 OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2666 _init_buffer_pools();
2667 CV_DbgAssert(bufferPoolHostPtr_);
2668 return *bufferPoolHostPtr_.get();
2672 directx::internal::OpenCLDirectXImpl* p_directx_impl;
2674 directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2676 if (!p_directx_impl)
2678 p_directx_impl = directx::internal::createDirectXImpl();
2680 return p_directx_impl;
2684 #ifdef HAVE_OPENCL_SVM
2685 bool svmInitialized;
2688 svm::SVMCapabilities svmCapabilities;
2689 svm::SVMFunctions svmFunctions;
2693 CV_Assert(handle != NULL);
2694 const Device& device = devices[0];
2695 cl_device_svm_capabilities deviceCaps = 0;
2696 CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2697 cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2698 if (status != CL_SUCCESS)
2700 CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2703 CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2704 CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2705 svmCapabilities.value_ =
2706 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2707 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2708 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2709 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2710 svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2711 if (svmCapabilities.value_ == 0)
2713 CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2719 CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2720 void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2723 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2724 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2729 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2730 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2732 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2733 CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2738 ((int*)ptr)[0] = 100;
2742 CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2745 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2747 CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2748 CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2753 CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2758 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2759 clSVMFree(handle, ptr);
2762 clSVMFree(handle, ptr);
2763 svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2764 svmFunctions.fn_clSVMFree = clSVMFree;
2765 svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2766 //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2767 //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2768 svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2769 svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2770 svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2771 svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2775 CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2778 // Try HSA extension
2779 String extensions = device.extensions();
2780 if (extensions.find("cl_amd_svm") == String::npos)
2782 CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2785 cl_platform_id p = NULL;
2786 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2787 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2788 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2789 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2790 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2791 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2792 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2793 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2794 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2795 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2796 CV_Assert(svmFunctions.isValid());
2800 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2805 svmAvailable = true;
2806 svmEnabled = !svm::checkDisableSVM();
2807 svmInitialized = true;
2808 CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2811 CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2812 svmAvailable = false;
2814 svmCapabilities.value_ = 0;
2815 svmInitialized = true;
2816 svmFunctions.fn_clSVMAlloc = NULL;
2820 std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2822 OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2824 _init_buffer_pools();
2825 CV_DbgAssert(bufferPoolSVM_);
2826 return *bufferPoolSVM_.get();
2830 friend class Program;
2845 Context::Context(int dtype)
2851 void Context::release()
2860 bool Context::create()
2865 p = Impl::findOrCreateContext(std::string());
2873 bool Context::create(int dtype)
2878 if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2880 p = Impl::findOrCreateContext("");
2882 else if (dtype == CL_DEVICE_TYPE_GPU)
2884 p = Impl::findOrCreateContext(":GPU:");
2886 else if (dtype == CL_DEVICE_TYPE_CPU)
2888 p = Impl::findOrCreateContext(":CPU:");
2892 CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2894 if (p && !p->handle)
2901 Context::Context(const Context& c)
2908 Context& Context::operator = (const Context& c)
2910 Impl* newp = (Impl*)c.p;
2919 void* Context::ptr() const
2921 return p == NULL ? NULL : p->handle;
2924 size_t Context::ndevices() const
2926 return p ? p->devices.size() : 0;
2929 Device& Context::device(size_t idx) const
2931 static Device dummy;
2932 return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2935 Context& Context::getDefault(bool initialize)
2937 auto& c = OpenCLExecutionContext::getCurrent();
2940 auto& ctx = c.getContext();
2944 CV_UNUSED(initialize);
2945 static Context dummy;
2949 Program Context::getProg(const ProgramSource& prog,
2950 const String& buildopts, String& errmsg)
2952 return p ? p->getProg(prog, buildopts, errmsg) : Program();
2955 void Context::unloadProg(Program& prog)
2958 p->unloadProg(prog);
2962 Context Context::fromHandle(void* context)
2965 ctx.p = Impl::findOrCreateContext((cl_context)context);
2970 Context Context::fromDevice(const ocl::Device& device)
2973 ctx.p = Impl::findOrCreateContext(device);
2978 Context Context::create(const std::string& configuration)
2981 ctx.p = Impl::findOrCreateContext(configuration);
2985 #ifdef HAVE_OPENCL_SVM
2986 bool Context::useSVM() const
2988 Context::Impl* i = p;
2990 if (!i->svmInitialized)
2992 return i->svmEnabled;
2994 void Context::setUseSVM(bool enabled)
2996 Context::Impl* i = p;
2998 if (!i->svmInitialized)
3000 if (enabled && !i->svmAvailable)
3002 CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3004 i->svmEnabled = enabled;
3007 bool Context::useSVM() const { return false; }
3008 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3011 #ifdef HAVE_OPENCL_SVM
3014 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3016 Context::Impl* i = context.p;
3018 if (!i->svmInitialized)
3020 return i->svmCapabilities;
3023 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3025 Context::Impl* i = context.p;
3027 CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3028 CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3029 return &i->svmFunctions;
3032 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3034 if (checkForceSVMUmatUsage())
3036 if (checkDisableSVMUMatUsage())
3038 if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3040 return false; // don't use SVM by default
3043 } // namespace cv::ocl::svm
3044 #endif // HAVE_OPENCL_SVM
3047 static void get_platform_name(cl_platform_id id, String& name)
3049 // get platform name string length
3051 CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3053 // get platform name string
3054 AutoBuffer<char> buf(sz + 1);
3055 CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3057 // just in case, ensure trailing zero for ASCIIZ string
3064 // Attaches OpenCL context to OpenCV
3066 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3068 auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3073 OpenCLExecutionContext OpenCLExecutionContext::create(
3074 const std::string& platformName, void* platformID, void* context, void* deviceID
3078 CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3081 CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3084 CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3086 std::vector<cl_platform_id> platforms(cnt);
3088 CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3090 bool platformAvailable = false;
3092 // check if external platformName contained in list of available platforms in OpenCV
3093 for (unsigned int i = 0; i < cnt; i++)
3095 String availablePlatformName;
3096 get_platform_name(platforms[i], availablePlatformName);
3097 // external platform is found in the list of available platforms
3098 if (platformName == availablePlatformName)
3100 platformAvailable = true;
3105 if (!platformAvailable)
3106 CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3108 // check if platformID corresponds to platformName
3109 String actualPlatformName;
3110 get_platform_name((cl_platform_id)platformID, actualPlatformName);
3111 if (platformName != actualPlatformName)
3112 CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3114 OpenCLExecutionContext ctx;
3115 ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3116 CV_OCL_CHECK(clReleaseContext((cl_context)context));
3117 CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3121 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3123 // internal call, less checks
3124 cl_platform_id platformID = (cl_platform_id)_platform;
3125 cl_context context = (cl_context)_context;
3126 cl_device_id deviceID = (cl_device_id)_device;
3128 std::string platformName = PlatformInfo(&platformID).name();
3130 auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3131 CV_Assert(!clExecCtx.empty());
3132 ctx = clExecCtx.getContext();
3135 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3139 inline void __init()
3143 isProfilingQueue_ = false;
3146 Impl(cl_command_queue q)
3151 cl_command_queue_properties props = 0;
3152 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3153 isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3156 Impl(cl_command_queue q, bool isProfilingQueue)
3160 isProfilingQueue_ = isProfilingQueue;
3163 Impl(const Context& c, const Device& d, bool withProfiling = false)
3167 const Context* pc = &c;
3168 cl_context ch = (cl_context)pc->ptr();
3171 pc = &Context::getDefault();
3172 ch = (cl_context)pc->ptr();
3174 cl_device_id dh = (cl_device_id)d.ptr();
3176 dh = (cl_device_id)pc->device(0).ptr();
3178 cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3179 CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3180 isProfilingQueue_ = withProfiling;
3186 if (!cv::__termination)
3191 CV_OCL_DBG_CHECK(clFinish(handle));
3192 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3198 const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3200 if (isProfilingQueue_)
3203 if (profiling_queue_.ptr())
3204 return profiling_queue_;
3207 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3209 cl_device_id device = 0;
3210 CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3212 cl_int result = CL_SUCCESS;
3213 cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3214 cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3215 CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3218 queue.p = new Impl(q, true);
3219 profiling_queue_ = queue;
3221 return profiling_queue_;
3224 IMPLEMENT_REFCOUNTABLE();
3226 cl_command_queue handle;
3227 bool isProfilingQueue_;
3228 cv::ocl::Queue profiling_queue_;
3236 Queue::Queue(const Context& c, const Device& d)
3242 Queue::Queue(const Queue& q)
3249 Queue& Queue::operator = (const Queue& q)
3251 Impl* newp = (Impl*)q.p;
3266 bool Queue::create(const Context& c, const Device& d)
3271 return p->handle != 0;
3274 void Queue::finish()
3278 CV_OCL_DBG_CHECK(clFinish(p->handle));
3282 const Queue& Queue::getProfilingQueue() const
3285 return p->getProfilingQueue(*this);
3288 void* Queue::ptr() const
3290 return p ? p->handle : 0;
3293 Queue& Queue::getDefault()
3295 auto& c = OpenCLExecutionContext::getCurrent();
3298 auto& q = c.getQueue();
3305 static cl_command_queue getQueue(const Queue& q)
3307 cl_command_queue qq = (cl_command_queue)q.ptr();
3309 qq = (cl_command_queue)Queue::getDefault().ptr();
3313 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3315 KernelArg::KernelArg()
3316 : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3320 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3321 : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3323 CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3326 KernelArg KernelArg::Constant(const Mat& m)
3328 CV_Assert(m.isContinuous());
3329 return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3332 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3336 Impl(const char* kname, const Program& prog) :
3337 refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3339 cl_program ph = (cl_program)prog.ptr();
3344 handle = clCreateKernel(ph, kname, &retval);
3345 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3347 for( int i = 0; i < MAX_ARRS; i++ )
3349 haveTempDstUMats = false;
3350 haveTempSrcUMats = false;
3355 for( int i = 0; i < MAX_ARRS; i++ )
3358 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3360 u[i]->flags |= UMatData::ASYNC_CLEANUP;
3361 u[i]->currAllocator->deallocate(u[i]);
3366 haveTempDstUMats = false;
3367 haveTempSrcUMats = false;
3370 void addUMat(const UMat& m, bool dst)
3372 CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3374 CV_XADD(&m.u->urefcount, 1);
3376 if(dst && m.u->tempUMat())
3377 haveTempDstUMats = true;
3378 if(m.u->originalUMatData == NULL && m.u->tempUMat())
3379 haveTempSrcUMats = true; // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3382 void addImage(const Image2D& image)
3384 images.push_back(image);
3387 void finit(cl_event e)
3392 isInProgress = false;
3396 bool run(int dims, size_t _globalsize[], size_t _localsize[],
3397 bool sync, int64* timeNS, const Queue& q);
3403 CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3407 IMPLEMENT_REFCOUNTABLE();
3411 enum { MAX_ARRS = 16 };
3412 UMatData* u[MAX_ARRS];
3414 bool isAsyncRun; // true if kernel was scheduled in async mode
3416 std::list<Image2D> images;
3417 bool haveTempDstUMats;
3418 bool haveTempSrcUMats;
3421 }} // namespace cv::ocl
3425 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3429 ((cv::ocl::Kernel::Impl*)p)->finit(e);
3431 catch (const cv::Exception& exc)
3433 CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3435 catch (const std::exception& exc)
3437 CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3441 CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3447 namespace cv { namespace ocl {
3454 Kernel::Kernel(const char* kname, const Program& prog)
3457 create(kname, prog);
3460 Kernel::Kernel(const char* kname, const ProgramSource& src,
3461 const String& buildopts, String* errmsg)
3464 create(kname, src, buildopts, errmsg);
3467 Kernel::Kernel(const Kernel& k)
3474 Kernel& Kernel::operator = (const Kernel& k)
3476 Impl* newp = (Impl*)k.p;
3491 bool Kernel::create(const char* kname, const Program& prog)
3495 p = new Impl(kname, prog);
3501 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3507 bool Kernel::create(const char* kname, const ProgramSource& src,
3508 const String& buildopts, String* errmsg)
3516 if( !errmsg ) errmsg = &tempmsg;
3517 const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3518 return create(kname, prog);
3521 void* Kernel::ptr() const
3523 return p ? p->handle : 0;
3526 bool Kernel::empty() const
3531 int Kernel::set(int i, const void* value, size_t sz)
3533 if (!p || !p->handle)
3540 cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3541 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3542 if (retval != CL_SUCCESS)
3547 int Kernel::set(int i, const Image2D& image2D)
3549 p->addImage(image2D);
3550 cl_mem h = (cl_mem)image2D.ptr();
3551 return set(i, &h, sizeof(h));
3554 int Kernel::set(int i, const UMat& m)
3556 return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3559 int Kernel::set(int i, const KernelArg& arg)
3561 if( !p || !p->handle )
3565 CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3566 p->name.c_str(), (int)i));
3574 AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3575 ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3576 bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3577 if (ptronly && arg.m->empty())
3579 cl_mem h_null = (cl_mem)NULL;
3580 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3581 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3584 cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3588 CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3589 p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3595 #ifdef HAVE_OPENCL_SVM
3596 if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3598 const Context& ctx = Context::getDefault();
3599 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3600 uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3601 CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3603 status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3605 status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3607 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3612 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3613 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3620 else if( arg.m->dims <= 2 )
3623 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3624 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3625 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3626 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3629 if( !(arg.flags & KernelArg::NO_SIZE) )
3631 int cols = u2d.cols*arg.wscale/arg.iwscale;
3632 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3633 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3634 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3635 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3642 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3643 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3644 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3645 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3646 status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3647 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3649 if( !(arg.flags & KernelArg::NO_SIZE) )
3651 int cols = u3d.cols*arg.wscale/arg.iwscale;
3652 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3653 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3654 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3655 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3656 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3657 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3661 p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3664 status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3665 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3669 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3670 bool sync, const Queue& q)
3675 size_t globalsize[CV_MAX_DIM] = {1,1,1};
3677 CV_Assert(_globalsize != NULL);
3678 for (int i = 0; i < dims; i++)
3680 size_t val = _localsize ? _localsize[i] :
3681 dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3682 CV_Assert( val > 0 );
3683 total *= _globalsize[i];
3684 if (_globalsize[i] == 1 && !_localsize)
3686 globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3688 CV_Assert(total > 0);
3690 return p->run(dims, globalsize, _localsize, sync, NULL, q);
3694 static bool isRaiseErrorOnReuseAsyncKernel()
3696 static bool initialized = false;
3697 static bool value = false;
3700 value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3706 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3707 bool sync, int64* timeNS, const Queue& q)
3709 CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3713 CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3719 CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3720 if (isRaiseErrorOnReuseAsyncKernel())
3722 return false; // OpenCV 5.0: raise error
3728 CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3729 if (isRaiseErrorOnReuseAsyncKernel())
3731 return false; // OpenCV 5.0: raise error
3734 cl_command_queue qq = getQueue(q);
3735 if (haveTempDstUMats)
3737 if (haveTempSrcUMats)
3741 cl_event asyncEvent = 0;
3742 cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3743 NULL, globalsize, localsize, 0, 0,
3744 (sync && !timeNS) ? 0 : &asyncEvent);
3745 #if !CV_OPENCL_SHOW_RUN_KERNELS
3746 if (retval != CL_SUCCESS)
3749 cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3750 globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3751 (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3752 sync ? "true" : "false"
3754 if (retval != CL_SUCCESS)
3756 msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3758 #if CV_OPENCL_TRACE_CHECK
3759 CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3761 printf("%s\n", msg.c_str());
3765 if (sync || retval != CL_SUCCESS)
3767 CV_OCL_DBG_CHECK(clFinish(qq));
3770 if (retval == CL_SUCCESS)
3772 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3773 cl_ulong startTime, stopTime;
3774 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3775 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3776 *timeNS = (int64)(stopTime - startTime);
3788 isInProgress = true;
3789 CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3792 CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3793 return retval == CL_SUCCESS;
3796 bool Kernel::runTask(bool sync, const Queue& q)
3798 if(!p || !p->handle || p->isInProgress)
3801 cl_command_queue qq = getQueue(q);
3802 cl_event asyncEvent = 0;
3803 cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3804 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3805 if (sync || retval != CL_SUCCESS)
3807 CV_OCL_DBG_CHECK(clFinish(qq));
3813 p->isInProgress = true;
3814 CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3817 CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3818 return retval == CL_SUCCESS;
3821 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3823 CV_Assert(p && p->handle && !p->isInProgress);
3824 Queue q = q_.ptr() ? q_ : Queue::getDefault();
3826 q.finish(); // call clFinish() on base queue
3827 Queue profilingQueue = q.getProfilingQueue();
3829 bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3830 return res ? timeNs : -1;
3833 size_t Kernel::workGroupSize() const
3835 if(!p || !p->handle)
3837 size_t val = 0, retsz = 0;
3838 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3839 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3840 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3841 return status == CL_SUCCESS ? val : 0;
3844 size_t Kernel::preferedWorkGroupSizeMultiple() const
3846 if(!p || !p->handle)
3848 size_t val = 0, retsz = 0;
3849 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3850 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3851 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3852 return status == CL_SUCCESS ? val : 0;
3855 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3857 if(!p || !p->handle || !wsz)
3860 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3861 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3862 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3863 return status == CL_SUCCESS;
3866 size_t Kernel::localMemSize() const
3868 if(!p || !p->handle)
3872 cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3873 cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3874 CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3875 return status == CL_SUCCESS ? (size_t)val : 0;
3880 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3882 struct ProgramSource::Impl
3884 IMPLEMENT_REFCOUNTABLE();
3887 PROGRAM_SOURCE_CODE = 0,
3893 Impl(const String& src)
3895 init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3896 initFromSource(src, cv::String());
3898 Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3900 init(PROGRAM_SOURCE_CODE, module, name);
3901 initFromSource(codeStr, codeHash);
3905 void init(enum KIND kind, const String& module, const String& name)
3914 isHashUpdated = false;
3917 void initFromSource(const String& codeStr, const String& codeHash)
3920 sourceHash_ = codeHash;
3921 if (sourceHash_.empty())
3927 isHashUpdated = true;
3931 void updateHash(const char* hashStr = NULL)
3935 sourceHash_ = cv::String(hashStr);
3936 isHashUpdated = true;
3942 case PROGRAM_SOURCE_CODE:
3945 CV_Assert(codeStr_.empty());
3946 hash = crc64(sourceAddr_, sourceSize_); // static storage
3950 CV_Assert(!codeStr_.empty());
3951 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3954 case PROGRAM_BINARIES:
3957 hash = crc64(sourceAddr_, sourceSize_);
3960 CV_Error(Error::StsInternal, "Internal error");
3962 sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3963 isHashUpdated = true;
3966 Impl(enum KIND kind,
3967 const String& module, const String& name,
3968 const unsigned char* binary, const size_t size,
3969 const cv::String& buildOptions = cv::String())
3971 init(kind, module, name);
3973 sourceAddr_ = binary;
3976 buildOptions_ = buildOptions;
3979 static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3980 const char* sourceCodeStaticStr, const char* hashStaticStr,
3981 const cv::String& buildOptions)
3983 ProgramSource result;
3984 result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3985 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3986 result.p->updateHash(hashStaticStr);
3990 static ProgramSource fromBinary(const String& module, const String& name,
3991 const unsigned char* binary, const size_t size,
3992 const cv::String& buildOptions)
3994 ProgramSource result;
3995 result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3999 static ProgramSource fromSPIR(const String& module, const String& name,
4000 const unsigned char* binary, const size_t size,
4001 const cv::String& buildOptions)
4003 ProgramSource result;
4004 result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4011 // TODO std::vector<ProgramSource> includes_;
4012 String codeStr_; // PROGRAM_SOURCE_CODE only
4014 const unsigned char* sourceAddr_;
4017 cv::String buildOptions_;
4022 friend struct Program::Impl;
4023 friend struct internal::ProgramEntry;
4024 friend struct Context::Impl;
4028 ProgramSource::ProgramSource()
4033 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4035 p = new Impl(module, name, codeStr, codeHash);
4038 ProgramSource::ProgramSource(const char* prog)
4043 ProgramSource::ProgramSource(const String& prog)
4048 ProgramSource::~ProgramSource()
4054 ProgramSource::ProgramSource(const ProgramSource& prog)
4061 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4063 Impl* newp = (Impl*)prog.p;
4072 const String& ProgramSource::source() const
4075 CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4076 CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4080 ProgramSource::hash_t ProgramSource::hash() const
4082 CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4085 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4086 const unsigned char* binary, const size_t size,
4087 const cv::String& buildOptions)
4090 CV_Assert(size > 0);
4091 return Impl::fromBinary(module, name, binary, size, buildOptions);
4094 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4095 const unsigned char* binary, const size_t size,
4096 const cv::String& buildOptions)
4099 CV_Assert(size > 0);
4100 return Impl::fromBinary(module, name, binary, size, buildOptions);
4104 internal::ProgramEntry::operator ProgramSource&() const
4106 if (this->pProgramSource == NULL)
4108 cv::AutoLock lock(cv::getInitializationMutex());
4109 if (this->pProgramSource == NULL)
4111 ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4112 ProgramSource* ptr = new ProgramSource(ps);
4113 const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4116 return *this->pProgramSource;
4121 /////////////////////////////////////////// Program /////////////////////////////////////////////
4124 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4132 return a + (cv::String(" ") + b);
4135 struct Program::Impl
4137 IMPLEMENT_REFCOUNTABLE();
4139 Impl(const ProgramSource& src,
4140 const String& _buildflags, String& errmsg) :
4143 buildflags(_buildflags)
4145 const ProgramSource::Impl* src_ = src.getImpl();
4147 sourceModule_ = src_->module_;
4148 sourceName_ = src_->name_;
4149 const Context ctx = Context::getDefault();
4150 Device device = ctx.device(0);
4151 if (ctx.ptr() == NULL || device.ptr() == NULL)
4153 buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4154 if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4157 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4158 else if (device.isIntel())
4159 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4160 const String param_buildExtraOptions = getBuildExtraOptions();
4161 if (!param_buildExtraOptions.empty())
4162 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4164 compile(ctx, src_, errmsg);
4167 bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4169 CV_Assert(ctx.getImpl());
4172 // We don't cache OpenCL binaries
4173 if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4175 CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4176 bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4179 return compileWithCache(ctx, src_, errmsg);
4182 bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4184 CV_Assert(ctx.getImpl());
4186 CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4188 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4189 OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4190 const std::string base_dir = config.prepareCacheDirectoryForContext(
4191 ctx.getImpl()->getPrefixString(),
4192 ctx.getImpl()->getPrefixBase()
4194 const String& hash_str = src_->sourceHash_;
4196 if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4198 CV_Assert(!hash_str.empty());
4199 fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4200 fname = utils::fs::join(base_dir, fname);
4202 const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4203 if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4207 std::vector<char> binaryBuf;
4210 cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4211 BinaryProgramFile file(fname, hash_str.c_str());
4212 res = file.read(buildflags, binaryBuf);
4216 CV_Assert(!binaryBuf.empty());
4217 CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4218 bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4223 catch (const cv::Exception& e)
4226 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4230 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4233 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4234 CV_Assert(handle == NULL);
4235 if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4237 if (!buildFromSources(ctx, src_, errmsg))
4242 else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4244 buildflags = joinBuildOptions(buildflags, " -x spir");
4245 if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4247 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4249 CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4250 bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4254 else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4256 CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4260 CV_Error(Error::StsInternal, "Internal error");
4262 CV_Assert(handle != NULL);
4263 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4264 if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4268 std::vector<char> binaryBuf;
4269 getProgramBinary(binaryBuf);
4271 cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4272 BinaryProgramFile file(fname, hash_str.c_str());
4273 file.write(buildflags, binaryBuf);
4276 catch (const cv::Exception& e)
4278 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4282 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4285 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4286 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4287 if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4289 std::vector<char> binaryBuf;
4290 getProgramBinary(binaryBuf);
4291 if (!binaryBuf.empty())
4293 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4295 createFromBinary(ctx, binaryBuf, errmsg);
4299 return handle != NULL;
4302 void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4304 AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4307 cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4308 CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4309 if (log_retval == CL_SUCCESS && retsz > 1)
4311 buffer.resize(retsz + 16);
4312 log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4313 CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4314 if (log_retval == CL_SUCCESS)
4316 if (retsz < buffer.size())
4319 buffer[buffer.size() - 1] = 0;
4327 errmsg = String(buffer.data());
4328 printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4329 sourceModule_.c_str(), sourceName_.c_str(),
4330 result, getOpenCLErrorString(result),
4331 buildflags.c_str(), errmsg.c_str());
4335 bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4338 CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4339 CV_Assert(handle == NULL);
4340 CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4341 sourceModule_.c_str(), sourceName_.c_str(),
4342 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4344 CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4346 const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4347 size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4348 CV_Assert(srcptr != NULL);
4349 CV_Assert(srclen > 0);
4353 handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4354 CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4355 CV_Assert(handle || retval != CL_SUCCESS);
4356 if (handle && retval == CL_SUCCESS)
4358 size_t n = ctx.ndevices();
4359 AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4360 cl_device_id* deviceList = deviceListBuf.data();
4361 for (size_t i = 0; i < n; i++)
4363 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4366 retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4367 CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4368 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4369 if (retval != CL_SUCCESS)
4372 dumpBuildLog_(retval, deviceList, errmsg);
4374 // don't remove "retval != CL_SUCCESS" condition here:
4375 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4376 if (retval != CL_SUCCESS && handle)
4378 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4382 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4383 if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4385 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4387 char kernels_buffer[4096] = {0};
4388 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4389 if (retsz < sizeof(kernels_buffer))
4390 kernels_buffer[retsz] = 0;
4392 kernels_buffer[0] = 0;
4393 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4398 return handle != NULL;
4401 void getProgramBinary(std::vector<char>& buf)
4405 CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4407 uchar* ptr = (uchar*)&buf[0];
4408 CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4411 bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4413 return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4416 bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4418 CV_Assert(handle == NULL);
4419 CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4420 CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4422 CV_Assert(binarySize > 0);
4424 size_t ndevices = (int)ctx.ndevices();
4425 AutoBuffer<cl_device_id> devices_(ndevices);
4426 AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4427 AutoBuffer<size_t> binarySizes_(ndevices);
4429 cl_device_id* devices = devices_.data();
4430 const uchar** binaryPtrs = binaryPtrs_.data();
4431 size_t* binarySizes = binarySizes_.data();
4432 for (size_t i = 0; i < ndevices; i++)
4434 devices[i] = (cl_device_id)ctx.device(i).ptr();
4435 binaryPtrs[i] = binaryAddr;
4436 binarySizes[i] = binarySize;
4440 handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4441 binarySizes, binaryPtrs, NULL, &result);
4442 if (result != CL_SUCCESS)
4444 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4447 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4455 // call clBuildProgram()
4457 result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4458 CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4459 if (result != CL_SUCCESS)
4461 dumpBuildLog_(result, devices, errmsg);
4464 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4470 // check build status
4472 cl_build_status build_status = CL_BUILD_NONE;
4474 CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4475 sizeof(build_status), &build_status, &retsz));
4476 if (result == CL_SUCCESS)
4478 if (build_status == CL_BUILD_SUCCESS)
4484 CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4490 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4493 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4498 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4499 if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4501 CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4503 char kernels_buffer[4096] = {0};
4504 result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4505 if (retsz < sizeof(kernels_buffer))
4506 kernels_buffer[retsz] = 0;
4508 kernels_buffer[0] = 0;
4509 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4512 return handle != NULL;
4520 if (!cv::__termination)
4523 clReleaseProgram(handle);
4532 String sourceModule_;
4537 Program::Program() { p = 0; }
4539 Program::Program(const ProgramSource& src,
4540 const String& buildflags, String& errmsg)
4543 create(src, buildflags, errmsg);
4546 Program::Program(const Program& prog)
4553 Program& Program::operator = (const Program& prog)
4555 Impl* newp = (Impl*)prog.p;
4570 bool Program::create(const ProgramSource& src,
4571 const String& buildflags, String& errmsg)
4578 p = new Impl(src, buildflags, errmsg);
4587 void* Program::ptr() const
4589 return p ? p->handle : 0;
4592 #ifndef OPENCV_REMOVE_DEPRECATED_API
4593 const ProgramSource& Program::source() const
4595 CV_Error(Error::StsNotImplemented, "Removed API");
4598 bool Program::read(const String& bin, const String& buildflags)
4600 CV_UNUSED(bin); CV_UNUSED(buildflags);
4601 CV_Error(Error::StsNotImplemented, "Removed API");
4604 bool Program::write(String& bin) const
4607 CV_Error(Error::StsNotImplemented, "Removed API");
4610 String Program::getPrefix() const
4614 Context::Impl* ctx_ = Context::getDefault().getImpl();
4616 return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4619 String Program::getPrefix(const String& buildflags)
4621 Context::Impl* ctx_ = Context::getDefault().getImpl();
4623 return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4625 #endif // OPENCV_REMOVE_DEPRECATED_API
4627 void Program::getBinary(std::vector<char>& binary) const
4629 CV_Assert(p && "Empty program");
4630 p->getProgramBinary(binary);
4633 Program Context::Impl::getProg(const ProgramSource& src,
4634 const String& buildflags, String& errmsg)
4636 size_t limit = getProgramCountLimit();
4637 const ProgramSource::Impl* src_ = src.getImpl();
4639 String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4640 src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4641 getPrefixString().c_str(),
4642 buildflags.c_str());
4644 cv::AutoLock lock(program_cache_mutex);
4645 phash_t::iterator it = phash.find(key);
4646 if (it != phash.end())
4649 CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4650 if (i != cacheList.end() && i != cacheList.begin())
4653 cacheList.push_front(key);
4657 { // cleanup program cache
4658 size_t sz = phash.size();
4659 if (limit > 0 && sz >= limit)
4661 static bool warningFlag = false;
4664 printf("\nWARNING: OpenCV-OpenCL:\n"
4665 " In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4666 " You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4669 while (!cacheList.empty())
4671 size_t c = phash.erase(cacheList.back());
4672 cacheList.pop_back();
4679 Program prog(src, buildflags, errmsg);
4680 // Cache result of build failures too (to prevent unnecessary compiler invocations)
4682 cv::AutoLock lock(program_cache_mutex);
4683 phash.insert(std::pair<std::string, Program>(key, prog));
4684 cacheList.push_front(key);
4690 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4692 template<typename T>
4693 class OpenCLBufferPool
4696 ~OpenCLBufferPool() { }
4698 virtual T allocate(size_t size) = 0;
4699 virtual void release(T buffer) = 0;
4702 template <typename Derived, typename BufferEntry, typename T>
4703 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4706 inline Derived& derived() { return *static_cast<Derived*>(this); }
4710 size_t currentReservedSize;
4711 size_t maxReservedSize;
4713 std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4714 std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4717 bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4719 typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4720 for (; i != allocatedEntries_.end(); ++i)
4722 BufferEntry& e = *i;
4723 if (e.clBuffer_ == buffer)
4726 allocatedEntries_.erase(i);
4734 bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4736 if (reservedEntries_.empty())
4738 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4739 typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4741 size_t minDiff = (size_t)(-1);
4742 for (; i != reservedEntries_.end(); ++i)
4744 BufferEntry& e = *i;
4745 if (e.capacity_ >= size)
4747 size_t diff = e.capacity_ - size;
4748 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4758 if (result_pos != reservedEntries_.end())
4760 //CV_DbgAssert(result == *result_pos);
4761 reservedEntries_.erase(result_pos);
4763 currentReservedSize -= entry.capacity_;
4764 allocatedEntries_.push_back(entry);
4771 void _checkSizeOfReservedEntries()
4773 while (currentReservedSize > maxReservedSize)
4775 CV_DbgAssert(!reservedEntries_.empty());
4776 const BufferEntry& entry = reservedEntries_.back();
4777 CV_DbgAssert(currentReservedSize >= entry.capacity_);
4778 currentReservedSize -= entry.capacity_;
4779 derived()._releaseBufferEntry(entry);
4780 reservedEntries_.pop_back();
4784 inline size_t _allocationGranularity(size_t size)
4787 if (size < 1024*1024)
4788 return 4096; // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4789 else if (size < 16*1024*1024)
4796 OpenCLBufferPoolBaseImpl()
4797 : currentReservedSize(0),
4802 virtual ~OpenCLBufferPoolBaseImpl()
4804 freeAllReservedBuffers();
4805 CV_Assert(reservedEntries_.empty());
4808 virtual T allocate(size_t size) CV_OVERRIDE
4810 AutoLock locker(mutex_);
4812 if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4814 CV_DbgAssert(size <= entry.capacity_);
4815 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4819 derived()._allocateBufferEntry(entry, size);
4821 return entry.clBuffer_;
4823 virtual void release(T buffer) CV_OVERRIDE
4825 AutoLock locker(mutex_);
4827 CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4828 if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4830 derived()._releaseBufferEntry(entry);
4834 reservedEntries_.push_front(entry);
4835 currentReservedSize += entry.capacity_;
4836 _checkSizeOfReservedEntries();
4840 virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4841 virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4842 virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4844 AutoLock locker(mutex_);
4845 size_t oldMaxReservedSize = maxReservedSize;
4846 maxReservedSize = size;
4847 if (maxReservedSize < oldMaxReservedSize)
4849 typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4850 for (; i != reservedEntries_.end();)
4852 const BufferEntry& entry = *i;
4853 if (entry.capacity_ > maxReservedSize / 8)
4855 CV_DbgAssert(currentReservedSize >= entry.capacity_);
4856 currentReservedSize -= entry.capacity_;
4857 derived()._releaseBufferEntry(entry);
4858 i = reservedEntries_.erase(i);
4863 _checkSizeOfReservedEntries();
4866 virtual void freeAllReservedBuffers() CV_OVERRIDE
4868 AutoLock locker(mutex_);
4869 typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4870 for (; i != reservedEntries_.end(); ++i)
4872 const BufferEntry& entry = *i;
4873 derived()._releaseBufferEntry(entry);
4875 reservedEntries_.clear();
4876 currentReservedSize = 0;
4880 struct CLBufferEntry
4884 CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4887 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4890 typedef struct CLBufferEntry BufferEntry;
4894 OpenCLBufferPoolImpl(int createFlags = 0)
4895 : createFlags_(createFlags)
4899 void _allocateBufferEntry(BufferEntry& entry, size_t size)
4901 CV_DbgAssert(entry.clBuffer_ == NULL);
4902 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4903 Context& ctx = Context::getDefault();
4904 cl_int retval = CL_SUCCESS;
4905 entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4906 CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4907 CV_Assert(entry.clBuffer_ != NULL);
4908 if(retval == CL_SUCCESS)
4910 CV_IMPL_ADD(CV_IMPL_OCL);
4912 LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4913 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4914 allocatedEntries_.push_back(entry);
4917 void _releaseBufferEntry(const BufferEntry& entry)
4919 CV_Assert(entry.capacity_ != 0);
4920 CV_Assert(entry.clBuffer_ != NULL);
4921 LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4922 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4923 CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4927 #ifdef HAVE_OPENCL_SVM
4928 struct CLSVMBufferEntry
4932 CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4934 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4937 typedef struct CLSVMBufferEntry BufferEntry;
4939 OpenCLSVMBufferPoolImpl()
4943 void _allocateBufferEntry(BufferEntry& entry, size_t size)
4945 CV_DbgAssert(entry.clBuffer_ == NULL);
4946 entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4948 Context& ctx = Context::getDefault();
4949 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4950 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4951 cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4952 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4954 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4955 CV_DbgAssert(svmFns->isValid());
4957 CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4958 void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4961 entry.clBuffer_ = buf;
4963 CV_IMPL_ADD(CV_IMPL_OCL);
4965 LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4966 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4967 allocatedEntries_.push_back(entry);
4970 void _releaseBufferEntry(const BufferEntry& entry)
4972 CV_Assert(entry.capacity_ != 0);
4973 CV_Assert(entry.clBuffer_ != NULL);
4974 LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4975 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4976 Context& ctx = Context::getDefault();
4977 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4978 CV_DbgAssert(svmFns->isValid());
4979 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_);
4980 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4987 template <bool readAccess, bool writeAccess>
4988 class AlignedDataPtr
4992 uchar* const originPtr_;
4993 const size_t alignment_;
4995 uchar* allocatedPtr_;
4998 AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4999 : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5001 CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5002 CV_DbgAssert(!readAccess || ptr);
5003 if (((size_t)ptr_ & (alignment - 1)) != 0)
5005 allocatedPtr_ = new uchar[size_ + alignment - 1];
5006 ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5009 memcpy(ptr_, originPtr_, size_);
5014 uchar* getAlignedPtr() const
5016 CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5026 memcpy(originPtr_, ptr_, size_);
5028 delete[] allocatedPtr_;
5029 allocatedPtr_ = NULL;
5034 AlignedDataPtr(const AlignedDataPtr&); // disabled
5035 AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5038 template <bool readAccess, bool writeAccess>
5039 class AlignedDataPtr2D
5043 uchar* const originPtr_;
5044 const size_t alignment_;
5046 uchar* allocatedPtr_;
5052 AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5053 : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5055 CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5056 CV_DbgAssert(!readAccess || ptr != NULL);
5057 if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5059 allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5060 ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5063 for (size_t i = 0; i < rows_; i++)
5064 memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5069 uchar* getAlignedPtr() const
5071 CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5081 for (size_t i = 0; i < rows_; i++)
5082 memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5084 delete[] allocatedPtr_;
5085 allocatedPtr_ = NULL;
5090 AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5091 AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5094 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5095 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5099 void Context::Impl::__init_buffer_pools()
5101 bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5102 OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5103 bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5104 OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5106 size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5107 size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5108 bufferPool.setMaxReservedSize(poolSize);
5109 size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5110 bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5112 #ifdef HAVE_OPENCL_SVM
5113 bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5114 OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5115 size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5116 bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5119 CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5122 class OpenCLAllocator CV_FINAL : public MatAllocator
5127 ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5128 ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5129 #ifdef HAVE_OPENCL_SVM
5130 ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5132 ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3 // convertFromBuffer()
5137 matStdAllocator = Mat::getDefaultAllocator();
5141 flushCleanupQueue();
5144 UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5145 AccessFlag flags, UMatUsageFlags usageFlags) const
5147 UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5151 static bool isOpenCLMapForced() // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5153 static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5156 static bool isOpenCLCopyingForced() // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5158 static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5162 void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5164 const Device& dev = ctx.device(0);
5166 if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5167 createFlags |= CL_MEM_ALLOC_HOST_PTR;
5169 if (!isOpenCLCopyingForced() &&
5170 (isOpenCLMapForced() ||
5171 (dev.hostUnifiedMemory()
5178 flags0 = static_cast<UMatData::MemoryFlag>(0);
5180 flags0 = UMatData::COPY_ON_MAP;
5183 UMatData* allocate(int dims, const int* sizes, int type,
5184 void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5187 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5189 flushCleanupQueue();
5191 CV_Assert(data == 0);
5192 size_t total = CV_ELEM_SIZE(type);
5193 for( int i = dims-1; i >= 0; i-- )
5200 Context& ctx = Context::getDefault();
5202 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5203 Context::Impl& ctxImpl = *ctx.getImpl();
5205 int createFlags = 0;
5206 UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5207 getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5209 void* handle = NULL;
5210 int allocatorFlags = 0;
5212 #ifdef HAVE_OPENCL_SVM
5213 const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5214 if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5216 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5217 handle = ctxImpl.getBufferPoolSVM().allocate(total);
5219 // this property is constant, so single buffer pool can be used here
5220 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5221 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5225 if (createFlags == 0)
5227 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5228 handle = ctxImpl.getBufferPool().allocate(total);
5230 else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5232 allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5233 handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5237 CV_Assert(handle != NULL); // Unsupported, throw
5241 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5243 UMatData* u = new UMatData(this);
5248 u->allocatorFlags_ = allocatorFlags;
5249 u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5250 CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5251 u->markHostCopyObsolete(true);
5252 opencl_allocator_stats.onAllocate(u->size);
5256 bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5261 flushCleanupQueue();
5263 UMatDataAutoLock lock(u);
5267 CV_Assert(u->origdata != 0);
5268 Context& ctx = Context::getDefault();
5269 int createFlags = 0;
5270 UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5271 getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5273 bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5275 cl_context ctx_handle = (cl_context)ctx.ptr();
5276 int allocatorFlags = 0;
5277 UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5278 void* handle = NULL;
5279 cl_int retval = CL_SUCCESS;
5281 #ifdef HAVE_OPENCL_SVM
5282 svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5283 bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5284 if (useSVM && svmCaps.isSupportFineGrainSystem())
5286 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5287 tempUMatFlags = UMatData::TEMP_UMAT;
5288 handle = u->origdata;
5289 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5291 else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5293 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5295 bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5297 cl_svm_mem_flags memFlags = createFlags |
5298 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5300 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5301 CV_DbgAssert(svmFns->isValid());
5303 CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5304 handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5307 cl_command_queue q = NULL;
5308 if (!isFineGrainBuffer)
5310 q = (cl_command_queue)Queue::getDefault().ptr();
5311 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5312 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5315 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5318 memcpy(handle, u->origdata, u->size);
5319 if (!isFineGrainBuffer)
5321 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5322 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5323 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5326 tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5327 allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5328 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5335 accessFlags &= ~ACCESS_FAST;
5337 tempUMatFlags = UMatData::TEMP_UMAT;
5342 CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5343 // There are OpenCL runtime issues for less aligned data
5344 && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5345 && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5346 // Avoid sharing of host memory between OpenCL buffers
5347 && !(u->originalUMatData && u->originalUMatData->handle)
5350 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5351 u->size, u->origdata, &retval);
5352 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5353 (long long int)u->size, u->origdata, (void*)handle).c_str());
5355 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5357 handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5358 u->size, u->origdata, &retval);
5359 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5360 (long long int)u->size, u->origdata, (void*)handle).c_str());
5361 tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5364 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5365 if(!handle || retval != CL_SUCCESS)
5368 u->prevAllocator = u->currAllocator;
5369 u->currAllocator = this;
5370 u->flags |= tempUMatFlags | flags0;
5371 u->allocatorFlags_ = allocatorFlags;
5373 if (!!(accessFlags & ACCESS_WRITE))
5374 u->markHostCopyObsolete(true);
5375 opencl_allocator_stats.onAllocate(u->size);
5379 /*void sync(UMatData* u) const
5381 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5382 UMatDataAutoLock lock(u);
5384 if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5386 if( u->tempCopiedUMat() )
5388 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5389 u->size, u->origdata, 0, 0, 0);
5394 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5395 (CL_MAP_READ | CL_MAP_WRITE),
5396 0, u->size, 0, 0, 0, &retval);
5397 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5400 u->markHostCopyObsolete(false);
5402 else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5404 clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5405 u->size, u->data, 0, 0, 0);
5409 void deallocate(UMatData* u) const CV_OVERRIDE
5414 CV_Assert(u->urefcount == 0);
5415 CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5417 CV_Assert(u->handle != 0);
5418 CV_Assert(u->mapcount == 0);
5420 if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5421 addToCleanupQueue(u);
5426 void deallocate_(UMatData* u) const
5429 CV_Assert(u->handle);
5430 if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5432 opencl_allocator_stats.onFree(u->size);
5436 if (cv::__termination) // process is not in consistent state (after ExitProcess call) and terminating
5437 return; // avoid any OpenCL calls
5441 CV_Assert(u->origdata);
5442 // UMatDataAutoLock lock(u);
5444 if (u->hostCopyObsolete())
5446 #ifdef HAVE_OPENCL_SVM
5447 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5449 Context& ctx = Context::getDefault();
5450 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5451 CV_DbgAssert(svmFns->isValid());
5453 if( u->tempCopiedUMat() )
5455 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5456 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5457 bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5458 cl_command_queue q = NULL;
5459 if (!isFineGrainBuffer)
5461 CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5462 q = (cl_command_queue)Queue::getDefault().ptr();
5463 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5464 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5467 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5470 memcpy(u->origdata, u->handle, u->size);
5471 if (!isFineGrainBuffer)
5473 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5474 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5475 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5480 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5487 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5488 if( u->tempCopiedUMat() )
5490 AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5491 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5492 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5499 CV_Assert(u->mapcount == 0);
5500 flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5501 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5502 (CL_MAP_READ | CL_MAP_WRITE),
5503 0, u->size, 0, 0, 0, &retval);
5504 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5505 CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5506 if (u->originalUMatData)
5508 CV_Assert(u->originalUMatData->data == data);
5510 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5511 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5512 CV_OCL_DBG_CHECK(clFinish(q));
5516 u->markHostCopyObsolete(false);
5522 #ifdef HAVE_OPENCL_SVM
5523 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5525 if( u->tempCopiedUMat() )
5527 Context& ctx = Context::getDefault();
5528 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5529 CV_DbgAssert(svmFns->isValid());
5531 CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5532 svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5538 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5539 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5542 u->markDeviceCopyObsolete(true);
5543 u->currAllocator = u->prevAllocator;
5544 u->prevAllocator = NULL;
5545 if(u->data && u->copyOnMap() && u->data != u->origdata)
5547 u->data = u->origdata;
5548 u->currAllocator->deallocate(u);
5553 CV_Assert(u->origdata == NULL);
5554 if(u->data && u->copyOnMap() && u->data != u->origdata)
5558 u->markHostCopyObsolete(true);
5560 if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5562 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5564 ocl::Context& ctx = *pCtx.get();
5565 CV_Assert(ctx.getImpl());
5566 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5568 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5570 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5572 ocl::Context& ctx = *pCtx.get();
5573 CV_Assert(ctx.getImpl());
5574 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5576 #ifdef HAVE_OPENCL_SVM
5577 else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5579 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5581 ocl::Context& ctx = *pCtx.get();
5582 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5586 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5587 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5589 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5590 CV_DbgAssert(svmFns->isValid());
5591 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5593 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5595 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5596 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5597 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5600 CV_Assert(ctx.getImpl());
5601 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5606 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5609 u->markDeviceCopyObsolete(true);
5613 CV_Assert(u == NULL);
5616 // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5617 void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5619 CV_Assert(u && u->handle);
5621 if (!!(accessFlags & ACCESS_WRITE))
5622 u->markDeviceCopyObsolete(true);
5624 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5627 if( !u->copyOnMap() )
5630 // because there can be other map requests for the same UMat with different access flags,
5631 // we use the universal (read-write) access mode.
5632 #ifdef HAVE_OPENCL_SVM
5633 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5635 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5637 Context& ctx = Context::getDefault();
5638 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5639 CV_DbgAssert(svmFns->isValid());
5641 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5643 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5644 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5647 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5648 u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5652 u->data = (uchar*)u->handle;
5653 u->markHostCopyObsolete(false);
5654 u->markDeviceMemMapped(true);
5659 cl_int retval = CL_SUCCESS;
5660 if (!u->deviceMemMapped())
5662 CV_Assert(u->refcount == 1);
5663 CV_Assert(u->mapcount++ == 0);
5664 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5665 (CL_MAP_READ | CL_MAP_WRITE),
5666 0, u->size, 0, 0, 0, &retval);
5667 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5669 if (u->data && retval == CL_SUCCESS)
5671 u->markHostCopyObsolete(false);
5672 u->markDeviceMemMapped(true);
5676 // TODO Is it really a good idea and was it tested well?
5677 // if map failed, switch to copy-on-map mode for the particular buffer
5678 u->flags |= UMatData::COPY_ON_MAP;
5683 u->data = (uchar*)fastMalloc(u->size);
5684 u->markHostCopyObsolete(true);
5688 if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5690 AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5691 #ifdef HAVE_OPENCL_SVM
5692 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5694 cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5695 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5696 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5697 (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5698 u->markHostCopyObsolete(false);
5702 void unmap(UMatData* u) const CV_OVERRIDE
5708 CV_Assert(u->handle != 0);
5710 UMatDataAutoLock autolock(u);
5712 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5714 if( !u->copyOnMap() && u->deviceMemMapped() )
5716 CV_Assert(u->data != NULL);
5717 #ifdef HAVE_OPENCL_SVM
5718 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5720 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5722 Context& ctx = Context::getDefault();
5723 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5724 CV_DbgAssert(svmFns->isValid());
5726 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5728 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5729 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5731 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5733 u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5736 if (u->refcount == 0)
5738 u->markDeviceCopyObsolete(false);
5739 u->markHostCopyObsolete(true);
5743 if (u->refcount == 0)
5745 CV_Assert(u->mapcount-- == 1);
5746 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5747 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5748 if (Device::getDefault().isAMD())
5750 // required for multithreaded applications (see stitching test)
5751 CV_OCL_DBG_CHECK(clFinish(q));
5753 u->markDeviceMemMapped(false);
5755 u->markDeviceCopyObsolete(false);
5756 u->markHostCopyObsolete(true);
5759 else if( u->copyOnMap() && u->deviceCopyObsolete() )
5761 AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5762 #ifdef HAVE_OPENCL_SVM
5763 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5765 retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5766 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5767 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5768 (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5769 u->markDeviceCopyObsolete(false);
5770 u->markHostCopyObsolete(true);
5774 bool checkContinuous(int dims, const size_t sz[],
5775 const size_t srcofs[], const size_t srcstep[],
5776 const size_t dstofs[], const size_t dststep[],
5777 size_t& total, size_t new_sz[],
5778 size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5779 size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5781 bool iscontinuous = true;
5782 srcrawofs = srcofs ? srcofs[dims-1] : 0;
5783 dstrawofs = dstofs ? dstofs[dims-1] : 0;
5785 for( int i = dims-2; i >= 0; i-- )
5787 if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5788 iscontinuous = false;
5791 srcrawofs += srcofs[i]*srcstep[i];
5793 dstrawofs += dstofs[i]*dststep[i];
5798 // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5801 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5802 // we assume that new_... arrays are initialized by caller
5803 // with 0's, so there is no else branch
5806 new_srcofs[0] = srcofs[1];
5807 new_srcofs[1] = srcofs[0];
5813 new_dstofs[0] = dstofs[1];
5814 new_dstofs[1] = dstofs[0];
5818 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5819 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5823 // we could check for dims == 3 here,
5824 // but from user perspective this one is more informative
5825 CV_Assert(dims <= 3);
5826 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5829 new_srcofs[0] = srcofs[2];
5830 new_srcofs[1] = srcofs[1];
5831 new_srcofs[2] = srcofs[0];
5836 new_dstofs[0] = dstofs[2];
5837 new_dstofs[1] = dstofs[1];
5838 new_dstofs[2] = dstofs[0];
5841 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5842 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5845 return iscontinuous;
5848 void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5849 const size_t srcofs[], const size_t srcstep[],
5850 const size_t dststep[]) const CV_OVERRIDE
5854 UMatDataAutoLock autolock(u);
5856 if( u->data && !u->hostCopyObsolete() )
5858 Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5861 CV_Assert( u->handle != 0 );
5863 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5865 size_t total = 0, new_sz[] = {0, 0, 0};
5866 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5867 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5869 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5871 srcrawofs, new_srcofs, new_srcstep,
5872 dstrawofs, new_dstofs, new_dststep);
5874 #ifdef HAVE_OPENCL_SVM
5875 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5877 CV_DbgAssert(u->data == NULL || u->data == u->handle);
5878 Context& ctx = Context::getDefault();
5879 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5880 CV_DbgAssert(svmFns->isValid());
5882 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5883 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5885 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5886 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5889 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5894 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5898 // This code is from MatAllocator::download()
5899 int isz[CV_MAX_DIM];
5900 uchar* srcptr = (uchar*)u->handle;
5901 for( int i = 0; i < dims; i++ )
5903 CV_Assert( sz[i] <= (size_t)INT_MAX );
5907 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5908 isz[i] = (int)sz[i];
5911 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5912 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5914 const Mat* arrays[] = { &src, &dst };
5916 NAryMatIterator it(arrays, ptrs, 2);
5917 size_t j, planesz = it.size;
5919 for( j = 0; j < it.nplanes; j++, ++it )
5920 memcpy(ptrs[1], ptrs[0], planesz);
5922 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5924 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5925 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5927 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5936 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5937 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5938 srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5940 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5942 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5943 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5944 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5945 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5946 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5947 uchar* ptr = alignedPtr.getAlignedPtr();
5949 CV_Assert(new_srcstep[0] >= new_sz[0]);
5950 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5951 total = std::min(total, u->size - new_srcrawofs);
5952 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5953 new_srcrawofs, total, ptr, 0, 0, 0));
5954 for( size_t i = 0; i < new_sz[1]; i++ )
5955 memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5959 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5960 uchar* ptr = alignedPtr.getAlignedPtr();
5962 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5963 new_srcofs, new_dstofs, new_sz,
5971 void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5972 const size_t dstofs[], const size_t dststep[],
5973 const size_t srcstep[]) const CV_OVERRIDE
5978 // there should be no user-visible CPU copies of the UMat which we are going to copy to
5979 CV_Assert(u->refcount == 0 || u->tempUMat());
5981 size_t total = 0, new_sz[] = {0, 0, 0};
5982 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5983 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5985 bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5987 srcrawofs, new_srcofs, new_srcstep,
5988 dstrawofs, new_dstofs, new_dststep);
5990 UMatDataAutoLock autolock(u);
5992 // if there is cached CPU copy of the GPU matrix,
5993 // we could use it as a destination.
5994 // we can do it in 2 cases:
5995 // 1. we overwrite the whole content
5996 // 2. we overwrite part of the matrix, but the GPU copy is out-of-date
5997 if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5999 Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6000 u->markHostCopyObsolete(false);
6001 u->markDeviceCopyObsolete(true);
6005 CV_Assert( u->handle != 0 );
6006 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6008 #ifdef HAVE_OPENCL_SVM
6009 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6011 CV_DbgAssert(u->data == NULL || u->data == u->handle);
6012 Context& ctx = Context::getDefault();
6013 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6014 CV_DbgAssert(svmFns->isValid());
6016 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6017 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6019 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6020 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6023 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6028 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6032 // This code is from MatAllocator::upload()
6033 int isz[CV_MAX_DIM];
6034 uchar* dstptr = (uchar*)u->handle;
6035 for( int i = 0; i < dims; i++ )
6037 CV_Assert( sz[i] <= (size_t)INT_MAX );
6041 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6042 isz[i] = (int)sz[i];
6045 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6046 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6048 const Mat* arrays[] = { &src, &dst };
6050 NAryMatIterator it(arrays, ptrs, 2);
6051 size_t j, planesz = it.size;
6053 for( j = 0; j < it.nplanes; j++, ++it )
6054 memcpy(ptrs[1], ptrs[0], planesz);
6056 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6058 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6059 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6061 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6070 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6071 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6072 dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6073 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6074 (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6076 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6078 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6079 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6080 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6081 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6082 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6083 uchar* ptr = alignedPtr.getAlignedPtr();
6085 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6086 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6087 total = std::min(total, u->size - new_dstrawofs);
6088 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6089 (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6090 (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6091 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6092 new_dstrawofs, total, ptr, 0, 0, 0));
6093 for( size_t i = 0; i < new_sz[1]; i++ )
6094 memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6095 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6096 new_dstrawofs, total, ptr, 0, 0, 0));
6100 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6101 uchar* ptr = alignedPtr.getAlignedPtr();
6103 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6104 new_dstofs, new_srcofs, new_sz,
6110 u->markHostCopyObsolete(true);
6111 #ifdef HAVE_OPENCL_SVM
6112 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6113 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6120 u->markHostCopyObsolete(true);
6122 u->markDeviceCopyObsolete(false);
6125 void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6126 const size_t srcofs[], const size_t srcstep[],
6127 const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6132 size_t total = 0, new_sz[] = {0, 0, 0};
6133 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6134 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6136 bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6138 srcrawofs, new_srcofs, new_srcstep,
6139 dstrawofs, new_dstofs, new_dststep);
6141 UMatDataAutoLock src_autolock(src, dst);
6143 if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6145 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6148 if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6150 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6151 dst->markHostCopyObsolete(false);
6152 #ifdef HAVE_OPENCL_SVM
6153 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6154 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6161 dst->markDeviceCopyObsolete(true);
6166 // there should be no user-visible CPU copies of the UMat which we are going to copy to
6167 CV_Assert(dst->refcount == 0);
6168 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6170 cl_int retval = CL_SUCCESS;
6171 #ifdef HAVE_OPENCL_SVM
6172 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6173 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6175 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6176 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6178 Context& ctx = Context::getDefault();
6179 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6180 CV_DbgAssert(svmFns->isValid());
6184 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6185 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6186 cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6187 (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6188 total, 0, NULL, NULL);
6189 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6194 // This code is from MatAllocator::download()/upload()
6195 int isz[CV_MAX_DIM];
6196 uchar* srcptr = (uchar*)src->handle;
6197 for( int i = 0; i < dims; i++ )
6199 CV_Assert( sz[i] <= (size_t)INT_MAX );
6203 srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6204 isz[i] = (int)sz[i];
6206 Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6208 uchar* dstptr = (uchar*)dst->handle;
6209 for( int i = 0; i < dims; i++ )
6212 dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6214 Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6216 const Mat* arrays[] = { &m_src, &m_dst };
6218 NAryMatIterator it(arrays, ptrs, 2);
6219 size_t j, planesz = it.size;
6221 for( j = 0; j < it.nplanes; j++, ++it )
6222 memcpy(ptrs[1], ptrs[0], planesz);
6227 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6229 map(src, ACCESS_READ);
6230 upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6235 map(dst, ACCESS_WRITE);
6236 download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6246 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6247 srcrawofs, dstrawofs, total, 0, 0, 0);
6248 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6249 (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6251 else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6253 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6254 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6255 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6256 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6257 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6259 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6260 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6261 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6262 CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6263 uchar* srcptr = srcBuf.getAlignedPtr();
6264 uchar* dstptr = dstBuf.getAlignedPtr();
6266 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6268 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6269 src_total = std::min(src_total, src->size - new_srcrawofs);
6270 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6271 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6273 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6274 new_srcrawofs, src_total, srcptr, 0, 0, 0));
6275 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6276 new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6278 for( size_t i = 0; i < new_sz[1]; i++ )
6279 memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6280 srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6281 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6282 new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6286 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6287 new_srcofs, new_dstofs, new_sz,
6293 if (retval == CL_SUCCESS)
6295 CV_IMPL_ADD(CV_IMPL_OCL)
6298 #ifdef HAVE_OPENCL_SVM
6299 if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6300 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6307 dst->markHostCopyObsolete(true);
6309 dst->markDeviceCopyObsolete(false);
6313 CV_OCL_DBG_CHECK(clFinish(q));
6317 BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6319 ocl::Context ctx = Context::getDefault();
6322 #ifdef HAVE_OPENCL_SVM
6323 if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6325 return &ctx.getImpl()->getBufferPoolSVM();
6328 if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6330 return &ctx.getImpl()->getBufferPoolHostPtr();
6332 if (id != NULL && strcmp(id, "OCL") != 0)
6334 CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6336 return &ctx.getImpl()->getBufferPool();
6339 MatAllocator* matStdAllocator;
6341 mutable cv::Mutex cleanupQueueMutex;
6342 mutable std::deque<UMatData*> cleanupQueue;
6344 void flushCleanupQueue() const
6346 if (!cleanupQueue.empty())
6348 std::deque<UMatData*> q;
6350 cv::AutoLock lock(cleanupQueueMutex);
6351 q.swap(cleanupQueue);
6353 for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6359 void addToCleanupQueue(UMatData* u) const
6361 //TODO: Validation check: CV_Assert(!u->tempUMat());
6363 cv::AutoLock lock(cleanupQueueMutex);
6364 cleanupQueue.push_back(u);
6369 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6371 static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6372 g_isOpenCVActivated = true;
6375 MatAllocator* getOpenCLAllocator()
6377 CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6380 }} // namespace cv::ocl
6385 // three funcs below are implemented in umatrix.cpp
6386 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6387 bool autoSteps = false );
6388 void finalizeHdr(UMat& m);
6393 namespace cv { namespace ocl {
6396 // Convert OpenCL buffer memory to UMat
6398 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6401 int sizes[] = { rows, cols };
6403 CV_Assert(0 <= d && d <= CV_MAX_DIM);
6407 dst.flags = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6408 dst.usageFlags = USAGE_DEFAULT;
6410 setSize(dst, d, sizes, 0, true);
6413 cl_mem memobj = (cl_mem)cl_mem_buffer;
6414 cl_mem_object_type mem_type = 0;
6416 CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6418 CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6421 CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6423 CV_OCL_CHECK(clRetainMemObject(memobj));
6425 CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6426 CV_Assert(total >= rows * step);
6428 // attach clBuffer to UMatData
6429 dst.u = new UMatData(getOpenCLAllocator());
6431 dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER; // not allocated from any OpenCV buffer pool
6432 dst.u->flags = static_cast<UMatData::MemoryFlag>(0);
6433 dst.u->handle = cl_mem_buffer;
6434 dst.u->origdata = 0;
6435 dst.u->prevAllocator = 0;
6436 dst.u->size = total;
6442 } // convertFromBuffer()
6446 // Convert OpenCL image2d_t memory to UMat
6448 void convertFromImage(void* cl_mem_image, UMat& dst)
6450 cl_mem clImage = (cl_mem)cl_mem_image;
6451 cl_mem_object_type mem_type = 0;
6453 CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6455 CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6457 cl_image_format fmt = { 0, 0 };
6458 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6461 switch (fmt.image_channel_data_type)
6464 case CL_UNSIGNED_INT8:
6469 case CL_SIGNED_INT8:
6473 case CL_UNORM_INT16:
6474 case CL_UNSIGNED_INT16:
6478 case CL_SNORM_INT16:
6479 case CL_SIGNED_INT16:
6483 case CL_SIGNED_INT32:
6492 CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6496 switch (fmt.image_channel_order)
6499 type = CV_MAKE_TYPE(depth, 1);
6505 type = CV_MAKE_TYPE(depth, 4);
6509 CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6514 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6517 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6520 CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6522 dst.create((int)h, (int)w, type);
6524 cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6526 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6529 size_t src_origin[3] = { 0, 0, 0 };
6530 size_t region[3] = { w, h, 1 };
6531 CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6533 CV_OCL_CHECK(clFinish(q));
6536 } // convertFromImage()
6539 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6541 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6543 cl_uint numDevices = 0;
6544 cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6545 if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6547 CV_OCL_DBG_CHECK_RESULT(status,
6548 cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6551 if (numDevices == 0)
6557 devices.resize((size_t)numDevices);
6558 CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6561 struct PlatformInfo::Impl
6566 handle = *(cl_platform_id*)id;
6567 getDevices(devices, handle);
6570 String getStrProp(cl_platform_info prop) const
6574 return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6575 sz < sizeof(buf) ? String(buf) : String();
6578 IMPLEMENT_REFCOUNTABLE();
6579 std::vector<cl_device_id> devices;
6580 cl_platform_id handle;
6583 PlatformInfo::PlatformInfo()
6588 PlatformInfo::PlatformInfo(void* platform_id)
6590 p = new Impl(platform_id);
6593 PlatformInfo::~PlatformInfo()
6599 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6606 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6619 int PlatformInfo::deviceNumber() const
6621 return p ? (int)p->devices.size() : 0;
6624 void PlatformInfo::getDevice(Device& device, int d) const
6626 CV_Assert(p && d < (int)p->devices.size() );
6628 device.set(p->devices[d]);
6631 String PlatformInfo::name() const
6633 return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6636 String PlatformInfo::vendor() const
6638 return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6641 String PlatformInfo::version() const
6643 return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6646 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6648 cl_uint numPlatforms = 0;
6649 CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6651 if (numPlatforms == 0)
6657 platforms.resize((size_t)numPlatforms);
6658 CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6661 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6663 std::vector<cl_platform_id> platforms;
6664 getPlatforms(platforms);
6666 for (size_t i = 0; i < platforms.size(); i++)
6667 platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6670 const char* typeToStr(int type)
6672 static const char* tab[]=
6674 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6675 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6676 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6677 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6678 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6679 "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6680 "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6681 "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6682 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6684 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6685 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6690 const char* memopTypeToStr(int type)
6692 static const char* tab[] =
6694 "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6695 "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6696 "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6697 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6698 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6699 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6700 "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6701 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6702 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6704 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6705 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6710 const char* vecopTypeToStr(int type)
6712 static const char* tab[] =
6714 "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6715 "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6716 "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6717 "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6718 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6719 "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6720 "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6721 "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6722 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6724 int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6725 const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6730 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6732 if( sdepth == ddepth )
6734 const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6735 if( ddepth >= CV_32F ||
6736 (ddepth == CV_32S && sdepth < CV_32S) ||
6737 (ddepth == CV_16S && sdepth <= CV_8S) ||
6738 (ddepth == CV_16U && sdepth == CV_8U))
6740 sprintf(buf, "convert_%s", typestr);
6742 else if( sdepth >= CV_32F )
6743 sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6745 sprintf(buf, "convert_%s_sat", typestr);
6750 const char* getOpenCLErrorString(int errorCode)
6752 #define CV_OCL_CODE(id) case id: return #id
6753 #define CV_OCL_CODE_(id, name) case id: return #name
6756 CV_OCL_CODE(CL_SUCCESS);
6757 CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6758 CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6759 CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6760 CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6761 CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6762 CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6763 CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6764 CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6765 CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6766 CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6767 CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6768 CV_OCL_CODE(CL_MAP_FAILURE);
6769 CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6770 CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6771 CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6772 CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6773 CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6774 CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6775 CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6776 CV_OCL_CODE(CL_INVALID_VALUE);
6777 CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6778 CV_OCL_CODE(CL_INVALID_PLATFORM);
6779 CV_OCL_CODE(CL_INVALID_DEVICE);
6780 CV_OCL_CODE(CL_INVALID_CONTEXT);
6781 CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6782 CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6783 CV_OCL_CODE(CL_INVALID_HOST_PTR);
6784 CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6785 CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6786 CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6787 CV_OCL_CODE(CL_INVALID_SAMPLER);
6788 CV_OCL_CODE(CL_INVALID_BINARY);
6789 CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6790 CV_OCL_CODE(CL_INVALID_PROGRAM);
6791 CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6792 CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6793 CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6794 CV_OCL_CODE(CL_INVALID_KERNEL);
6795 CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6796 CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6797 CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6798 CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6799 CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6800 CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6801 CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6802 CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6803 CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6804 CV_OCL_CODE(CL_INVALID_EVENT);
6805 CV_OCL_CODE(CL_INVALID_OPERATION);
6806 CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6807 CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6808 CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6809 CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6811 CV_OCL_CODE(CL_INVALID_PROPERTY);
6813 CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6814 CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6815 CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6816 CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6818 CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6819 CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6821 CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6822 CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6823 CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6824 CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6825 CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6826 CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6827 default: return "Unknown OpenCL error";
6833 template <typename T>
6834 static std::string kerToStr(const Mat & k)
6836 int width = k.cols - 1, depth = k.depth();
6837 const T * const data = k.ptr<T>();
6839 std::ostringstream stream;
6840 stream.precision(10);
6844 for (int i = 0; i < width; ++i)
6845 stream << "DIG(" << (int)data[i] << ")";
6846 stream << "DIG(" << (int)data[width] << ")";
6848 else if (depth == CV_32F)
6850 stream.setf(std::ios_base::showpoint);
6851 for (int i = 0; i < width; ++i)
6852 stream << "DIG(" << data[i] << "f)";
6853 stream << "DIG(" << data[width] << "f)";
6857 for (int i = 0; i < width; ++i)
6858 stream << "DIG(" << data[i] << ")";
6859 stream << "DIG(" << data[width] << ")";
6862 return stream.str();
6865 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6867 Mat kernel = _kernel.getMat().reshape(1, 1);
6869 int depth = kernel.depth();
6873 if (ddepth != depth)
6874 kernel.convertTo(kernel, ddepth);
6876 typedef std::string (* func_t)(const Mat &);
6877 static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6878 kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6879 const func_t func = funcs[ddepth];
6880 CV_Assert(func != 0);
6882 return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6885 #define PROCESS_SRC(src) \
6890 CV_Assert(src.isMat() || src.isUMat()); \
6891 Size csize = src.size(); \
6892 int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6893 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6894 if (cwidth < ckercn || ckercn <= 0) \
6896 cols.push_back(cwidth); \
6897 if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6899 offsets.push_back(src.offset()); \
6900 steps.push_back(src.step()); \
6901 dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6902 kercns.push_back(ckercn); \
6907 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6908 InputArray src4, InputArray src5, InputArray src6,
6909 InputArray src7, InputArray src8, InputArray src9,
6910 OclVectorStrategy strat)
6912 const ocl::Device & d = ocl::Device::getDefault();
6914 int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6915 d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6916 d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6917 d.preferredVectorWidthDouble(), -1 };
6919 // if the device says don't use vectors
6920 if (vectorWidths[0] == 1)
6923 vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6924 vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6925 vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6928 return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6931 int checkOptimalVectorWidth(const int *vectorWidths,
6932 InputArray src1, InputArray src2, InputArray src3,
6933 InputArray src4, InputArray src5, InputArray src6,
6934 InputArray src7, InputArray src8, InputArray src9,
6935 OclVectorStrategy strat)
6937 CV_Assert(vectorWidths);
6939 int ref_type = src1.type();
6941 std::vector<size_t> offsets, steps, cols;
6942 std::vector<int> dividers, kercns;
6953 size_t size = offsets.size();
6955 for (size_t i = 0; i < size; ++i)
6956 while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6957 dividers[i] >>= 1, kercns[i] >>= 1;
6960 int kercn = *std::min_element(kercns.begin(), kercns.end());
6965 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6966 InputArray src4, InputArray src5, InputArray src6,
6967 InputArray src7, InputArray src8, InputArray src9)
6969 return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6975 // TODO Make this as a method of OpenCL "BuildOptions" class
6976 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6978 if (!buildOptions.empty())
6979 buildOptions += " ";
6980 int type = _m.type(), depth = CV_MAT_DEPTH(type);
6981 buildOptions += format(
6982 "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6983 name.c_str(), ocl::typeToStr(type),
6984 name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6985 name.c_str(), (int)CV_MAT_CN(type),
6986 name.c_str(), (int)CV_ELEM_SIZE(type),
6987 name.c_str(), (int)CV_ELEM_SIZE1(type),
6988 name.c_str(), (int)depth
6993 struct Image2D::Impl
6995 Impl(const UMat &src, bool norm, bool alias)
6999 init(src, norm, alias);
7005 clReleaseMemObject(handle);
7008 static cl_image_format getImageFormat(int depth, int cn, bool norm)
7010 cl_image_format format;
7011 static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7012 CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7013 static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7014 CL_SNORM_INT16, -1, -1, -1, -1 };
7015 static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7017 int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7018 int channelOrder = channelOrders[cn];
7019 format.image_channel_data_type = (cl_channel_type)channelType;
7020 format.image_channel_order = (cl_channel_order)channelOrder;
7024 static bool isFormatSupported(cl_image_format format)
7027 CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7029 cl_context context = (cl_context)Context::getDefault().ptr();
7033 // Figure out how many formats are supported by this context.
7034 cl_uint numFormats = 0;
7035 cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7036 CL_MEM_OBJECT_IMAGE2D, numFormats,
7038 CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7041 AutoBuffer<cl_image_format> formats(numFormats);
7042 err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7043 CL_MEM_OBJECT_IMAGE2D, numFormats,
7044 formats.data(), NULL);
7045 CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7046 for (cl_uint i = 0; i < numFormats; ++i)
7048 if (!memcmp(&formats[i], &format, sizeof(format)))
7057 void init(const UMat &src, bool norm, bool alias)
7060 CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7062 CV_Assert(!src.empty());
7063 CV_Assert(ocl::Device::getDefault().imageSupport());
7065 int err, depth = src.depth(), cn = src.channels();
7067 cl_image_format format = getImageFormat(depth, cn, norm);
7069 if (!isFormatSupported(format))
7070 CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7072 if (alias && !src.handle(ACCESS_RW))
7073 CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7075 cl_context context = (cl_context)Context::getDefault().ptr();
7076 cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7078 #ifdef CL_VERSION_1_2
7079 // this enables backwards portability to
7080 // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7081 const Device & d = ocl::Device::getDefault();
7082 int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7083 CV_Assert(!alias || canCreateAlias(src));
7084 if (1 < major || (1 == major && 2 <= minor))
7087 desc.image_type = CL_MEM_OBJECT_IMAGE2D;
7088 desc.image_width = src.cols;
7089 desc.image_height = src.rows;
7090 desc.image_depth = 0;
7091 desc.image_array_size = 1;
7092 desc.image_row_pitch = alias ? src.step[0] : 0;
7093 desc.image_slice_pitch = 0;
7094 desc.buffer = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7095 desc.num_mip_levels = 0;
7096 desc.num_samples = 0;
7097 handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7102 CV_SUPPRESS_DEPRECATED_START
7103 CV_Assert(!alias); // This is an OpenCL 1.2 extension
7104 handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7105 CV_SUPPRESS_DEPRECATED_END
7107 CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7109 size_t origin[] = { 0, 0, 0 };
7110 size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7113 if (!alias && !src.isContinuous())
7115 devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7116 CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7117 (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7120 const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7121 CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7122 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7123 CV_OCL_DBG_CHECK(clFlush(queue));
7127 devData = (cl_mem)src.handle(ACCESS_READ);
7129 CV_Assert(devData != NULL);
7133 CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7134 if (!src.isContinuous())
7136 CV_OCL_DBG_CHECK(clFlush(queue));
7137 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7142 IMPLEMENT_REFCOUNTABLE();
7152 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7154 p = new Impl(src, norm, alias);
7157 bool Image2D::canCreateAlias(const UMat &m)
7160 const Device & d = ocl::Device::getDefault();
7161 if (d.imageFromBufferSupport() && !m.empty())
7163 // This is the required pitch alignment in pixels
7164 uint pitchAlign = d.imagePitchAlignment();
7165 if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7167 // We don't currently handle the case where the buffer was created
7168 // with CL_MEM_USE_HOST_PTR
7169 if (!m.u->tempUMat())
7178 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7180 cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7182 return Impl::isFormatSupported(format);
7185 Image2D::Image2D(const Image2D & i)
7192 Image2D & Image2D::operator = (const Image2D & i)
7211 void* Image2D::ptr() const
7213 return p ? p->handle : 0;
7216 bool internal::isOpenCLForced()
7218 static bool initialized = false;
7219 static bool value = false;
7222 value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7228 bool internal::isPerformanceCheckBypassed()
7230 static bool initialized = false;
7231 static bool value = false;
7234 value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7240 bool internal::isCLBuffer(UMat& u)
7242 void* h = u.handle(ACCESS_RW);
7245 CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7247 if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7250 cl_mem_object_type type = 0;
7251 cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7252 if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7262 Impl(const Queue& q)
7271 CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7277 CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7281 uint64 durationNS() const
7283 return (uint64)(timer.getTimeSec() * 1e9);
7289 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7290 Timer::~Timer() { delete p; }
7304 uint64 Timer::durationNS() const
7307 return p->durationNS();
7313 namespace cv { namespace directx { namespace internal {
7314 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7316 ocl::Context::Impl* i = ctx.getImpl();
7318 return i->getDirectXImpl();
7320 }}} // namespace cv::directx::internal
7323 #endif // HAVE_OPENCL