Merge pull request #19137 from VadimLevin:dev/vlevin/safe-string-conversion
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832 // true if we have initialized OpenCL subsystem with available platforms
833 static bool g_isOpenCVActivated = false;
834
835 bool haveOpenCL()
836 {
837     CV_TRACE_FUNCTION();
838     static bool g_isOpenCLInitialized = false;
839     static bool g_isOpenCLAvailable = false;
840
841     if (!g_isOpenCLInitialized)
842     {
843         CV_TRACE_REGION("Init_OpenCL_Runtime");
844         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
845         if (envPath)
846         {
847             if (cv::String(envPath) == "disabled")
848             {
849                 g_isOpenCLAvailable = false;
850                 g_isOpenCLInitialized = true;
851             }
852         }
853         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
854         try
855         {
856             cl_uint n = 0;
857             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
858             g_isOpenCVActivated = n > 0;
859         }
860         catch (...)
861         {
862             g_isOpenCLAvailable = false;
863         }
864         g_isOpenCLInitialized = true;
865     }
866     return g_isOpenCLAvailable;
867 }
868
869 bool useOpenCL()
870 {
871     CoreTLSData& data = getCoreTlsData();
872     if (data.useOpenCL < 0)
873     {
874         try
875         {
876             data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
877         }
878         catch (...)
879         {
880             data.useOpenCL = 0;
881         }
882     }
883     return data.useOpenCL > 0;
884 }
885
886 bool isOpenCLActivated()
887 {
888     if (!g_isOpenCVActivated)
889         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
890     return useOpenCL();
891 }
892
893 void setUseOpenCL(bool flag)
894 {
895     CV_TRACE_FUNCTION();
896
897     CoreTLSData& data = getCoreTlsData();
898     if (!flag)
899     {
900         data.useOpenCL = 0;
901     }
902     else if( haveOpenCL() )
903     {
904         data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
905     }
906 }
907
908 #ifdef HAVE_CLAMDBLAS
909
910 class AmdBlasHelper
911 {
912 public:
913     static AmdBlasHelper & getInstance()
914     {
915         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
916     }
917
918     bool isAvailable() const
919     {
920         return g_isAmdBlasAvailable;
921     }
922
923     ~AmdBlasHelper()
924     {
925         try
926         {
927             clAmdBlasTeardown();
928         }
929         catch (...) { }
930     }
931
932 protected:
933     AmdBlasHelper()
934     {
935         if (!g_isAmdBlasInitialized)
936         {
937             AutoLock lock(getInitializationMutex());
938
939             if (!g_isAmdBlasInitialized)
940             {
941                 if (haveOpenCL())
942                 {
943                     try
944                     {
945                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
946                     }
947                     catch (...)
948                     {
949                         g_isAmdBlasAvailable = false;
950                     }
951                 }
952                 else
953                     g_isAmdBlasAvailable = false;
954
955                 g_isAmdBlasInitialized = true;
956             }
957         }
958     }
959
960 private:
961     static bool g_isAmdBlasInitialized;
962     static bool g_isAmdBlasAvailable;
963 };
964
965 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
966 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
967
968 bool haveAmdBlas()
969 {
970     return AmdBlasHelper::getInstance().isAvailable();
971 }
972
973 #else
974
975 bool haveAmdBlas()
976 {
977     return false;
978 }
979
980 #endif
981
982 #ifdef HAVE_CLAMDFFT
983
984 class AmdFftHelper
985 {
986 public:
987     static AmdFftHelper & getInstance()
988     {
989         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
990     }
991
992     bool isAvailable() const
993     {
994         return g_isAmdFftAvailable;
995     }
996
997     ~AmdFftHelper()
998     {
999         try
1000         {
1001 //            clAmdFftTeardown();
1002         }
1003         catch (...) { }
1004     }
1005
1006 protected:
1007     AmdFftHelper()
1008     {
1009         if (!g_isAmdFftInitialized)
1010         {
1011             AutoLock lock(getInitializationMutex());
1012
1013             if (!g_isAmdFftInitialized)
1014             {
1015                 if (haveOpenCL())
1016                 {
1017                     try
1018                     {
1019                         cl_uint major, minor, patch;
1020                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1021
1022                         // it throws exception in case AmdFft binaries are not found
1023                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1024                         g_isAmdFftAvailable = true;
1025                     }
1026                     catch (const Exception &)
1027                     {
1028                         g_isAmdFftAvailable = false;
1029                     }
1030                 }
1031                 else
1032                     g_isAmdFftAvailable = false;
1033
1034                 g_isAmdFftInitialized = true;
1035             }
1036         }
1037     }
1038
1039 private:
1040     static clAmdFftSetupData setupData;
1041     static bool g_isAmdFftInitialized;
1042     static bool g_isAmdFftAvailable;
1043 };
1044
1045 clAmdFftSetupData AmdFftHelper::setupData;
1046 bool AmdFftHelper::g_isAmdFftAvailable = false;
1047 bool AmdFftHelper::g_isAmdFftInitialized = false;
1048
1049 bool haveAmdFft()
1050 {
1051     return AmdFftHelper::getInstance().isAvailable();
1052 }
1053
1054 #else
1055
1056 bool haveAmdFft()
1057 {
1058     return false;
1059 }
1060
1061 #endif
1062
1063 bool haveSVM()
1064 {
1065 #ifdef HAVE_OPENCL_SVM
1066     return true;
1067 #else
1068     return false;
1069 #endif
1070 }
1071
1072 void finish()
1073 {
1074     Queue::getDefault().finish();
1075 }
1076
1077 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1078
1079 struct Platform::Impl
1080 {
1081     Impl()
1082     {
1083         refcount = 1;
1084         handle = 0;
1085         initialized = false;
1086     }
1087
1088     ~Impl() {}
1089
1090     void init()
1091     {
1092         if( !initialized )
1093         {
1094             //cl_uint num_entries
1095             cl_uint n = 0;
1096             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1097                 handle = 0;
1098             if( handle != 0 )
1099             {
1100                 char buf[1000];
1101                 size_t len = 0;
1102                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1103                 buf[len] = '\0';
1104                 vendor = String(buf);
1105             }
1106
1107             initialized = true;
1108         }
1109     }
1110
1111     IMPLEMENT_REFCOUNTABLE();
1112
1113     cl_platform_id handle;
1114     String vendor;
1115     bool initialized;
1116 };
1117
1118 Platform::Platform()
1119 {
1120     p = 0;
1121 }
1122
1123 Platform::~Platform()
1124 {
1125     if(p)
1126         p->release();
1127 }
1128
1129 Platform::Platform(const Platform& pl)
1130 {
1131     p = (Impl*)pl.p;
1132     if(p)
1133         p->addref();
1134 }
1135
1136 Platform& Platform::operator = (const Platform& pl)
1137 {
1138     Impl* newp = (Impl*)pl.p;
1139     if(newp)
1140         newp->addref();
1141     if(p)
1142         p->release();
1143     p = newp;
1144     return *this;
1145 }
1146
1147 void* Platform::ptr() const
1148 {
1149     return p ? p->handle : 0;
1150 }
1151
1152 Platform& Platform::getDefault()
1153 {
1154     static Platform p;
1155     if( !p.p )
1156     {
1157         p.p = new Impl;
1158         p.p->init();
1159     }
1160     return p;
1161 }
1162
1163 /////////////////////////////////////// Device ////////////////////////////////////////////
1164
1165 // Version has format:
1166 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1167 // by specification
1168 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1169 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1170 //   https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html
1171 //   https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html
1172 static void parseOpenCLVersion(const String &version, int &major, int &minor)
1173 {
1174     major = minor = 0;
1175     if (10 >= version.length())
1176         return;
1177     const char *pstr = version.c_str();
1178     if (0 != strncmp(pstr, "OpenCL ", 7))
1179         return;
1180     size_t ppos = version.find('.', 7);
1181     if (String::npos == ppos)
1182         return;
1183     String temp = version.substr(7, ppos - 7);
1184     major = atoi(temp.c_str());
1185     temp = version.substr(ppos + 1);
1186     minor = atoi(temp.c_str());
1187 }
1188
1189 struct Device::Impl
1190 {
1191     Impl(void* d)
1192     {
1193         handle = (cl_device_id)d;
1194         refcount = 1;
1195
1196         name_ = getStrProp(CL_DEVICE_NAME);
1197         version_ = getStrProp(CL_DEVICE_VERSION);
1198         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1199         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1200         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1201         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1202         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1203         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1204         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1205         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1206
1207         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1208         parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1209
1210         size_t pos = 0;
1211         while (pos < extensions_.size())
1212         {
1213             size_t pos2 = extensions_.find(' ', pos);
1214             if (pos2 == String::npos)
1215                 pos2 = extensions_.size();
1216             if (pos2 > pos)
1217             {
1218                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1219                 extensions_set_.insert(extensionName);
1220             }
1221             pos = pos2 + 1;
1222         }
1223
1224         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1225
1226         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1227         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1228             vendorName_ == "AMD")
1229             vendorID_ = VENDOR_AMD;
1230         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1231             vendorID_ = VENDOR_INTEL;
1232         else if (vendorName_ == "NVIDIA Corporation")
1233             vendorID_ = VENDOR_NVIDIA;
1234         else
1235             vendorID_ = UNKNOWN_VENDOR;
1236
1237         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1238         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1239         {
1240             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1241             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1242                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1243             maxWorkGroupSize_ = new_maxWorkGroupSize;
1244         }
1245 #if 0
1246         if (isExtensionSupported("cl_khr_spir"))
1247         {
1248 #ifndef CL_DEVICE_SPIR_VERSIONS
1249 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1250 #endif
1251             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1252             std::cout << spir_versions << std::endl;
1253         }
1254 #endif
1255     }
1256
1257     template<typename _TpCL, typename _TpOut>
1258     _TpOut getProp(cl_device_info prop) const
1259     {
1260         _TpCL temp=_TpCL();
1261         size_t sz = 0;
1262
1263         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1264             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1265     }
1266
1267     bool getBoolProp(cl_device_info prop) const
1268     {
1269         cl_bool temp = CL_FALSE;
1270         size_t sz = 0;
1271
1272         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1273             sz == sizeof(temp) ? temp != 0 : false;
1274     }
1275
1276     String getStrProp(cl_device_info prop) const
1277     {
1278         char buf[4096];
1279         size_t sz=0;
1280         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1281             sz < sizeof(buf) ? String(buf) : String();
1282     }
1283
1284     bool isExtensionSupported(const std::string& extensionName) const
1285     {
1286         return extensions_set_.count(extensionName) > 0;
1287     }
1288
1289
1290     IMPLEMENT_REFCOUNTABLE();
1291
1292     cl_device_id handle;
1293
1294     String name_;
1295     String version_;
1296     std::string extensions_;
1297     int doubleFPConfig_;
1298     bool hostUnifiedMemory_;
1299     int maxComputeUnits_;
1300     size_t maxWorkGroupSize_;
1301     int type_;
1302     int addressBits_;
1303     int deviceVersionMajor_;
1304     int deviceVersionMinor_;
1305     String driverVersion_;
1306     String vendorName_;
1307     int vendorID_;
1308     bool intelSubgroupsSupport_;
1309
1310     std::set<std::string> extensions_set_;
1311 };
1312
1313
1314 Device::Device()
1315 {
1316     p = 0;
1317 }
1318
1319 Device::Device(void* d)
1320 {
1321     p = 0;
1322     set(d);
1323 }
1324
1325 Device::Device(const Device& d)
1326 {
1327     p = d.p;
1328     if(p)
1329         p->addref();
1330 }
1331
1332 Device& Device::operator = (const Device& d)
1333 {
1334     Impl* newp = (Impl*)d.p;
1335     if(newp)
1336         newp->addref();
1337     if(p)
1338         p->release();
1339     p = newp;
1340     return *this;
1341 }
1342
1343 Device::~Device()
1344 {
1345     if(p)
1346         p->release();
1347 }
1348
1349 void Device::set(void* d)
1350 {
1351     if(p)
1352         p->release();
1353     p = new Impl(d);
1354 }
1355
1356 void* Device::ptr() const
1357 {
1358     return p ? p->handle : 0;
1359 }
1360
1361 String Device::name() const
1362 { return p ? p->name_ : String(); }
1363
1364 String Device::extensions() const
1365 { return p ? String(p->extensions_) : String(); }
1366
1367 bool Device::isExtensionSupported(const String& extensionName) const
1368 { return p ? p->isExtensionSupported(extensionName) : false; }
1369
1370 String Device::version() const
1371 { return p ? p->version_ : String(); }
1372
1373 String Device::vendorName() const
1374 { return p ? p->vendorName_ : String(); }
1375
1376 int Device::vendorID() const
1377 { return p ? p->vendorID_ : 0; }
1378
1379 String Device::OpenCL_C_Version() const
1380 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1381
1382 String Device::OpenCLVersion() const
1383 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1384
1385 int Device::deviceVersionMajor() const
1386 { return p ? p->deviceVersionMajor_ : 0; }
1387
1388 int Device::deviceVersionMinor() const
1389 { return p ? p->deviceVersionMinor_ : 0; }
1390
1391 String Device::driverVersion() const
1392 { return p ? p->driverVersion_ : String(); }
1393
1394 int Device::type() const
1395 { return p ? p->type_ : 0; }
1396
1397 int Device::addressBits() const
1398 { return p ? p->addressBits_ : 0; }
1399
1400 bool Device::available() const
1401 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1402
1403 bool Device::compilerAvailable() const
1404 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1405
1406 bool Device::linkerAvailable() const
1407 #ifdef CL_VERSION_1_2
1408 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1409 #else
1410 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1411 #endif
1412
1413 int Device::doubleFPConfig() const
1414 { return p ? p->doubleFPConfig_ : 0; }
1415
1416 int Device::singleFPConfig() const
1417 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1418
1419 int Device::halfFPConfig() const
1420 #ifdef CL_VERSION_1_2
1421 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1422 #else
1423 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1424 #endif
1425
1426 bool Device::endianLittle() const
1427 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1428
1429 bool Device::errorCorrectionSupport() const
1430 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1431
1432 int Device::executionCapabilities() const
1433 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1434
1435 size_t Device::globalMemCacheSize() const
1436 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1437
1438 int Device::globalMemCacheType() const
1439 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1440
1441 int Device::globalMemCacheLineSize() const
1442 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1443
1444 size_t Device::globalMemSize() const
1445 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1446
1447 size_t Device::localMemSize() const
1448 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1449
1450 int Device::localMemType() const
1451 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1452
1453 bool Device::hostUnifiedMemory() const
1454 { return p ? p->hostUnifiedMemory_ : false; }
1455
1456 bool Device::imageSupport() const
1457 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1458
1459 bool Device::imageFromBufferSupport() const
1460 {
1461     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1462 }
1463
1464 uint Device::imagePitchAlignment() const
1465 {
1466 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1467     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1468 #else
1469     return 0;
1470 #endif
1471 }
1472
1473 uint Device::imageBaseAddressAlignment() const
1474 {
1475 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1476     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1477 #else
1478     return 0;
1479 #endif
1480 }
1481
1482 size_t Device::image2DMaxWidth() const
1483 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1484
1485 size_t Device::image2DMaxHeight() const
1486 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1487
1488 size_t Device::image3DMaxWidth() const
1489 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1490
1491 size_t Device::image3DMaxHeight() const
1492 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1493
1494 size_t Device::image3DMaxDepth() const
1495 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1496
1497 size_t Device::imageMaxBufferSize() const
1498 #ifdef CL_VERSION_1_2
1499 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1500 #else
1501 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1502 #endif
1503
1504 size_t Device::imageMaxArraySize() const
1505 #ifdef CL_VERSION_1_2
1506 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1507 #else
1508 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1509 #endif
1510
1511 bool Device::intelSubgroupsSupport() const
1512 { return p ? p->intelSubgroupsSupport_ : false; }
1513
1514 int Device::maxClockFrequency() const
1515 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1516
1517 int Device::maxComputeUnits() const
1518 { return p ? p->maxComputeUnits_ : 0; }
1519
1520 int Device::maxConstantArgs() const
1521 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1522
1523 size_t Device::maxConstantBufferSize() const
1524 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1525
1526 size_t Device::maxMemAllocSize() const
1527 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1528
1529 size_t Device::maxParameterSize() const
1530 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1531
1532 int Device::maxReadImageArgs() const
1533 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1534
1535 int Device::maxWriteImageArgs() const
1536 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1537
1538 int Device::maxSamplers() const
1539 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1540
1541 size_t Device::maxWorkGroupSize() const
1542 { return p ? p->maxWorkGroupSize_ : 0; }
1543
1544 int Device::maxWorkItemDims() const
1545 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1546
1547 void Device::maxWorkItemSizes(size_t* sizes) const
1548 {
1549     if(p)
1550     {
1551         const int MAX_DIMS = 32;
1552         size_t retsz = 0;
1553         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1554                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1555     }
1556 }
1557
1558 int Device::memBaseAddrAlign() const
1559 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1560
1561 int Device::nativeVectorWidthChar() const
1562 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1563
1564 int Device::nativeVectorWidthShort() const
1565 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1566
1567 int Device::nativeVectorWidthInt() const
1568 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1569
1570 int Device::nativeVectorWidthLong() const
1571 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1572
1573 int Device::nativeVectorWidthFloat() const
1574 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1575
1576 int Device::nativeVectorWidthDouble() const
1577 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1578
1579 int Device::nativeVectorWidthHalf() const
1580 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1581
1582 int Device::preferredVectorWidthChar() const
1583 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1584
1585 int Device::preferredVectorWidthShort() const
1586 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1587
1588 int Device::preferredVectorWidthInt() const
1589 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1590
1591 int Device::preferredVectorWidthLong() const
1592 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1593
1594 int Device::preferredVectorWidthFloat() const
1595 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1596
1597 int Device::preferredVectorWidthDouble() const
1598 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1599
1600 int Device::preferredVectorWidthHalf() const
1601 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1602
1603 size_t Device::printfBufferSize() const
1604 #ifdef CL_VERSION_1_2
1605 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1606 #else
1607 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1608 #endif
1609
1610
1611 size_t Device::profilingTimerResolution() const
1612 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1613
1614 const Device& Device::getDefault()
1615 {
1616     const Context& ctx = Context::getDefault();
1617     int idx = getCoreTlsData().device;
1618     const Device& device = ctx.device(idx);
1619     return device;
1620 }
1621
1622 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1623
1624 template <typename Functor, typename ObjectType>
1625 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
1626 {
1627     ::size_t required;
1628     cl_int err = f(obj, name, 0, NULL, &required);
1629     if (err != CL_SUCCESS)
1630         return err;
1631
1632     param.clear();
1633     if (required > 0)
1634     {
1635         AutoBuffer<char> buf(required + 1);
1636         char* ptr = buf.data(); // cleanup is not needed
1637         err = f(obj, name, required, ptr, NULL);
1638         if (err != CL_SUCCESS)
1639             return err;
1640         param = ptr;
1641     }
1642
1643     return CL_SUCCESS;
1644 }
1645
1646 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
1647 {
1648     elems.clear();
1649     if (s.size() == 0)
1650         return;
1651     std::istringstream ss(s);
1652     std::string item;
1653     while (!ss.eof())
1654     {
1655         std::getline(ss, item, delim);
1656         elems.push_back(item);
1657     }
1658 }
1659
1660 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
1661 // Sample: AMD:GPU:
1662 // Sample: AMD:GPU:Tahiti
1663 // Sample: :GPU|CPU: = '' = ':' = '::'
1664 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
1665         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
1666 {
1667     std::vector<std::string> parts;
1668     split(configurationStr, ':', parts);
1669     if (parts.size() > 3)
1670     {
1671         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
1672         return false;
1673     }
1674     if (parts.size() > 2)
1675         deviceNameOrID = parts[2];
1676     if (parts.size() > 1)
1677     {
1678         split(parts[1], '|', deviceTypes);
1679     }
1680     if (parts.size() > 0)
1681     {
1682         platform = parts[0];
1683     }
1684     return true;
1685 }
1686
1687 #if defined WINRT || defined _WIN32_WCE
1688 static cl_device_id selectOpenCLDevice()
1689 {
1690     return NULL;
1691 }
1692 #else
1693 // std::tolower is int->int
1694 static char char_tolower(char ch)
1695 {
1696     return (char)std::tolower((int)ch);
1697 }
1698 static cl_device_id selectOpenCLDevice()
1699 {
1700     std::string platform, deviceName;
1701     std::vector<std::string> deviceTypes;
1702
1703     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
1704     if (configuration &&
1705             (strcmp(configuration, "disabled") == 0 ||
1706              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
1707             ))
1708         return NULL;
1709
1710     bool isID = false;
1711     int deviceID = -1;
1712     if (deviceName.length() == 1)
1713     // We limit ID range to 0..9, because we want to write:
1714     // - '2500' to mean i5-2500
1715     // - '8350' to mean AMD FX-8350
1716     // - '650' to mean GeForce 650
1717     // To extend ID range change condition to '> 0'
1718     {
1719         isID = true;
1720         for (size_t i = 0; i < deviceName.length(); i++)
1721         {
1722             if (!isdigit(deviceName[i]))
1723             {
1724                 isID = false;
1725                 break;
1726             }
1727         }
1728         if (isID)
1729         {
1730             deviceID = atoi(deviceName.c_str());
1731             if (deviceID < 0)
1732                 return NULL;
1733         }
1734     }
1735
1736     std::vector<cl_platform_id> platforms;
1737     {
1738         cl_uint numPlatforms = 0;
1739         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
1740
1741         if (numPlatforms == 0)
1742             return NULL;
1743         platforms.resize((size_t)numPlatforms);
1744         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
1745         platforms.resize(numPlatforms);
1746     }
1747
1748     int selectedPlatform = -1;
1749     if (platform.length() > 0)
1750     {
1751         for (size_t i = 0; i < platforms.size(); i++)
1752         {
1753             std::string name;
1754             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
1755             if (name.find(platform) != std::string::npos)
1756             {
1757                 selectedPlatform = (int)i;
1758                 break;
1759             }
1760         }
1761         if (selectedPlatform == -1)
1762         {
1763             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
1764             goto not_found;
1765         }
1766     }
1767     if (deviceTypes.size() == 0)
1768     {
1769         if (!isID)
1770         {
1771             deviceTypes.push_back("GPU");
1772             if (configuration)
1773                 deviceTypes.push_back("CPU");
1774         }
1775         else
1776             deviceTypes.push_back("ALL");
1777     }
1778     for (size_t t = 0; t < deviceTypes.size(); t++)
1779     {
1780         int deviceType = 0;
1781         std::string tempStrDeviceType = deviceTypes[t];
1782         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
1783
1784         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1785             deviceType = Device::TYPE_GPU;
1786         else if (tempStrDeviceType == "cpu")
1787             deviceType = Device::TYPE_CPU;
1788         else if (tempStrDeviceType == "accelerator")
1789             deviceType = Device::TYPE_ACCELERATOR;
1790         else if (tempStrDeviceType == "all")
1791             deviceType = Device::TYPE_ALL;
1792         else
1793         {
1794             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
1795             goto not_found;
1796         }
1797
1798         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
1799         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
1800                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
1801                 i++)
1802         {
1803             cl_uint count = 0;
1804             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
1805             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1806             {
1807                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
1808             }
1809             if (count == 0)
1810                 continue;
1811             size_t base = devices.size();
1812             devices.resize(base + count);
1813             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
1814             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1815             {
1816                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
1817             }
1818         }
1819
1820         for (size_t i = (isID ? deviceID : 0);
1821              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
1822              i++)
1823         {
1824             std::string name;
1825             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
1826             cl_bool useGPU = true;
1827             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1828             {
1829                 cl_bool isIGPU = CL_FALSE;
1830                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
1831                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
1832             }
1833             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
1834             {
1835                 // TODO check for OpenCL 1.1
1836                 return devices[i];
1837             }
1838         }
1839     }
1840
1841 not_found:
1842     if (!configuration)
1843         return NULL; // suppress messages on stderr
1844
1845     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
1846             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
1847             << "    Device types: ";
1848     for (size_t t = 0; t < deviceTypes.size(); t++)
1849         std::cerr << deviceTypes[t] << " ";
1850
1851     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
1852     return NULL;
1853 }
1854 #endif
1855
1856 #ifdef HAVE_OPENCL_SVM
1857 namespace svm {
1858
1859 enum AllocatorFlags { // don't use first 16 bits
1860         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
1861         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
1862         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
1863         OPENCL_SVM_BUFFER_MASK = 3 << 16,
1864         OPENCL_SVM_BUFFER_MAP = 4 << 16
1865 };
1866
1867 static bool checkForceSVMUmatUsage()
1868 {
1869     static bool initialized = false;
1870     static bool force = false;
1871     if (!initialized)
1872     {
1873         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
1874         initialized = true;
1875     }
1876     return force;
1877 }
1878 static bool checkDisableSVMUMatUsage()
1879 {
1880     static bool initialized = false;
1881     static bool force = false;
1882     if (!initialized)
1883     {
1884         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
1885         initialized = true;
1886     }
1887     return force;
1888 }
1889 static bool checkDisableSVM()
1890 {
1891     static bool initialized = false;
1892     static bool force = false;
1893     if (!initialized)
1894     {
1895         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
1896         initialized = true;
1897     }
1898     return force;
1899 }
1900 // see SVMCapabilities
1901 static unsigned int getSVMCapabilitiesMask()
1902 {
1903     static bool initialized = false;
1904     static unsigned int mask = 0;
1905     if (!initialized)
1906     {
1907         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
1908         if (envValue == NULL)
1909         {
1910             return ~0U; // all bits 1
1911         }
1912         mask = atoi(envValue);
1913         initialized = true;
1914     }
1915     return mask;
1916 }
1917 } // namespace
1918 #endif
1919
1920 static size_t getProgramCountLimit()
1921 {
1922     static bool initialized = false;
1923     static size_t count = 0;
1924     if (!initialized)
1925     {
1926         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
1927         initialized = true;
1928     }
1929     return count;
1930 }
1931
1932 struct Context::Impl
1933 {
1934     static Context::Impl* get(Context& context) { return context.p; }
1935
1936     void __init()
1937     {
1938         refcount = 1;
1939         handle = 0;
1940 #ifdef HAVE_OPENCL_SVM
1941         svmInitialized = false;
1942 #endif
1943     }
1944
1945     Impl()
1946     {
1947         __init();
1948     }
1949
1950     void setDefault()
1951     {
1952         CV_Assert(handle == NULL);
1953
1954         cl_device_id d = selectOpenCLDevice();
1955
1956         if (d == NULL)
1957             return;
1958
1959         cl_platform_id pl = NULL;
1960         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
1961
1962         cl_context_properties prop[] =
1963         {
1964             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1965             0
1966         };
1967
1968         // !!! in the current implementation force the number of devices to 1 !!!
1969         cl_uint nd = 1;
1970         cl_int status;
1971
1972         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
1973         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
1974
1975         bool ok = handle != 0 && status == CL_SUCCESS;
1976         if( ok )
1977         {
1978             devices.resize(nd);
1979             devices[0].set(d);
1980         }
1981         else
1982             handle = NULL;
1983     }
1984
1985     Impl(int dtype0)
1986     {
1987         __init();
1988
1989         cl_int retval = 0;
1990         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
1991         cl_context_properties prop[] =
1992         {
1993             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1994             0
1995         };
1996
1997         cl_uint nd0 = 0;
1998         int dtype = dtype0 & 15;
1999         cl_int status = clGetDeviceIDs(pl, dtype, 0, NULL, &nd0);
2000         if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
2001         {
2002             CV_OCL_DBG_CHECK_RESULT(status,
2003                 cv::format("clGetDeviceIDs(platform=%p, device_type=%d, num_entries=0, devices=NULL, numDevices=%p)", pl, dtype, &nd0).c_str());
2004         }
2005
2006         if (nd0 == 0)
2007             return;
2008
2009         AutoBuffer<void*> dlistbuf(nd0*2+1);
2010         cl_device_id* dlist = (cl_device_id*)dlistbuf.data();
2011         cl_device_id* dlist_new = dlist + nd0;
2012         CV_OCL_DBG_CHECK(clGetDeviceIDs(pl, dtype, nd0, dlist, &nd0));
2013
2014         cl_uint i, nd = 0;
2015         String name0;
2016         for(i = 0; i < nd0; i++)
2017         {
2018             Device d(dlist[i]);
2019             if( !d.available() || !d.compilerAvailable() )
2020                 continue;
2021             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2022                 continue;
2023             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2024                 continue;
2025             String name = d.name();
2026             if( nd != 0 && name != name0 )
2027                 continue;
2028             name0 = name;
2029             dlist_new[nd++] = dlist[i];
2030         }
2031
2032         if(nd == 0)
2033             return;
2034
2035         // !!! in the current implementation force the number of devices to 1 !!!
2036         nd = 1;
2037
2038         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2039         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateContext");
2040         bool ok = handle != 0 && retval == CL_SUCCESS;
2041         if( ok )
2042         {
2043             devices.resize(nd);
2044             for( i = 0; i < nd; i++ )
2045                 devices[i].set(dlist_new[i]);
2046         }
2047     }
2048
2049     ~Impl()
2050     {
2051         if(handle)
2052         {
2053             CV_OCL_DBG_CHECK(clReleaseContext(handle));
2054             handle = NULL;
2055         }
2056         devices.clear();
2057     }
2058
2059     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2060
2061     void unloadProg(Program& prog)
2062     {
2063         cv::AutoLock lock(program_cache_mutex);
2064         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2065         {
2066               phash_t::iterator it = phash.find(*i);
2067               if (it != phash.end())
2068               {
2069                   if (it->second.ptr() == prog.ptr())
2070                   {
2071                       phash.erase(*i);
2072                       cacheList.erase(i);
2073                       return;
2074                   }
2075               }
2076         }
2077     }
2078
2079     std::string& getPrefixString()
2080     {
2081         if (prefix.empty())
2082         {
2083             cv::AutoLock lock(program_cache_mutex);
2084             if (prefix.empty())
2085             {
2086                 CV_Assert(!devices.empty());
2087                 const Device& d = devices[0];
2088                 int bits = d.addressBits();
2089                 if (bits > 0 && bits != 64)
2090                     prefix = cv::format("%d-bit--", bits);
2091                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2092                 // sanitize chars
2093                 for (size_t i = 0; i < prefix.size(); i++)
2094                 {
2095                     char c = prefix[i];
2096                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2097                     {
2098                         prefix[i] = '_';
2099                     }
2100                 }
2101             }
2102         }
2103         return prefix;
2104     }
2105
2106     std::string& getPrefixBase()
2107     {
2108         if (prefix_base.empty())
2109         {
2110             cv::AutoLock lock(program_cache_mutex);
2111             if (prefix_base.empty())
2112             {
2113                 const Device& d = devices[0];
2114                 int bits = d.addressBits();
2115                 if (bits > 0 && bits != 64)
2116                     prefix_base = cv::format("%d-bit--", bits);
2117                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2118                 // sanitize chars
2119                 for (size_t i = 0; i < prefix_base.size(); i++)
2120                 {
2121                     char c = prefix_base[i];
2122                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2123                     {
2124                         prefix_base[i] = '_';
2125                     }
2126                 }
2127             }
2128         }
2129         return prefix_base;
2130     }
2131
2132     IMPLEMENT_REFCOUNTABLE();
2133
2134     cl_context handle;
2135     std::vector<Device> devices;
2136
2137     std::string prefix;
2138     std::string prefix_base;
2139
2140     cv::Mutex program_cache_mutex;
2141     typedef std::map<std::string, Program> phash_t;
2142     phash_t phash;
2143     typedef std::list<cv::String> CacheList;
2144     CacheList cacheList;
2145
2146 #ifdef HAVE_OPENCL_SVM
2147     bool svmInitialized;
2148     bool svmAvailable;
2149     bool svmEnabled;
2150     svm::SVMCapabilities svmCapabilities;
2151     svm::SVMFunctions svmFunctions;
2152
2153     void svmInit()
2154     {
2155         CV_Assert(handle != NULL);
2156         const Device& device = devices[0];
2157         cl_device_svm_capabilities deviceCaps = 0;
2158         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2159         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2160         if (status != CL_SUCCESS)
2161         {
2162             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2163             goto noSVM;
2164         }
2165         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2166         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2167         svmCapabilities.value_ =
2168                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2169                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2170                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2171                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2172         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2173         if (svmCapabilities.value_ == 0)
2174         {
2175             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2176             goto noSVM;
2177         }
2178         try
2179         {
2180             // Try OpenCL 2.0
2181             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2182             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2183             if (!ptr)
2184             {
2185                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2186                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2187             }
2188             try
2189             {
2190                 bool error = false;
2191                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2192                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2193                 {
2194                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2195                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2196                 }
2197                 clFinish(q);
2198                 try
2199                 {
2200                     ((int*)ptr)[0] = 100;
2201                 }
2202                 catch (...)
2203                 {
2204                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2205                     error = true;
2206                 }
2207                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2208                 {
2209                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2210                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2211                 }
2212                 clFinish(q);
2213                 if (error)
2214                 {
2215                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2216                 }
2217             }
2218             catch (...)
2219             {
2220                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2221                 clSVMFree(handle, ptr);
2222                 throw;
2223             }
2224             clSVMFree(handle, ptr);
2225             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2226             svmFunctions.fn_clSVMFree = clSVMFree;
2227             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2228             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2229             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2230             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2231             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2232             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2233             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2234         }
2235         catch (...)
2236         {
2237             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2238             try
2239             {
2240                 // Try HSA extension
2241                 String extensions = device.extensions();
2242                 if (extensions.find("cl_amd_svm") == String::npos)
2243                 {
2244                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2245                     goto noSVM;
2246                 }
2247                 cl_platform_id p = NULL;
2248                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2249                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2250                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2251                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2252                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2253                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2254                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2255                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2256                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2257                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2258                 CV_Assert(svmFunctions.isValid());
2259             }
2260             catch (...)
2261             {
2262                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2263                 goto noSVM;
2264             }
2265         }
2266
2267         svmAvailable = true;
2268         svmEnabled = !svm::checkDisableSVM();
2269         svmInitialized = true;
2270         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2271         return;
2272     noSVM:
2273         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2274         svmAvailable = false;
2275         svmEnabled = false;
2276         svmCapabilities.value_ = 0;
2277         svmInitialized = true;
2278         svmFunctions.fn_clSVMAlloc = NULL;
2279         return;
2280     }
2281 #endif
2282
2283     friend class Program;
2284 };
2285
2286
2287 Context::Context()
2288 {
2289     p = 0;
2290 }
2291
2292 Context::Context(int dtype)
2293 {
2294     p = 0;
2295     create(dtype);
2296 }
2297
2298 bool Context::create()
2299 {
2300     if( !haveOpenCL() )
2301         return false;
2302     if(p)
2303         p->release();
2304     p = new Impl();
2305     if(!p->handle)
2306     {
2307         delete p;
2308         p = 0;
2309     }
2310     return p != 0;
2311 }
2312
2313 bool Context::create(int dtype0)
2314 {
2315     if( !haveOpenCL() )
2316         return false;
2317     if(p)
2318         p->release();
2319     p = new Impl(dtype0);
2320     if(!p->handle)
2321     {
2322         delete p;
2323         p = 0;
2324     }
2325     return p != 0;
2326 }
2327
2328 Context::~Context()
2329 {
2330     if (p)
2331     {
2332         p->release();
2333         p = NULL;
2334     }
2335 }
2336
2337 Context::Context(const Context& c)
2338 {
2339     p = (Impl*)c.p;
2340     if(p)
2341         p->addref();
2342 }
2343
2344 Context& Context::operator = (const Context& c)
2345 {
2346     Impl* newp = (Impl*)c.p;
2347     if(newp)
2348         newp->addref();
2349     if(p)
2350         p->release();
2351     p = newp;
2352     return *this;
2353 }
2354
2355 void* Context::ptr() const
2356 {
2357     return p == NULL ? NULL : p->handle;
2358 }
2359
2360 size_t Context::ndevices() const
2361 {
2362     return p ? p->devices.size() : 0;
2363 }
2364
2365 const Device& Context::device(size_t idx) const
2366 {
2367     static Device dummy;
2368     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2369 }
2370
2371 Context& Context::getDefault(bool initialize)
2372 {
2373     static Context* ctx = new Context();
2374     if(!ctx->p && haveOpenCL())
2375     {
2376         if (!ctx->p)
2377             ctx->p = new Impl();
2378         if (initialize)
2379         {
2380             // do not create new Context right away.
2381             // First, try to retrieve existing context of the same type.
2382             // In its turn, Platform::getContext() may call Context::create()
2383             // if there is no such context.
2384             if (ctx->p->handle == NULL)
2385                 ctx->p->setDefault();
2386         }
2387     }
2388
2389     return *ctx;
2390 }
2391
2392 Program Context::getProg(const ProgramSource& prog,
2393                          const String& buildopts, String& errmsg)
2394 {
2395     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2396 }
2397
2398 void Context::unloadProg(Program& prog)
2399 {
2400     if (p)
2401         p->unloadProg(prog);
2402 }
2403
2404 #ifdef HAVE_OPENCL_SVM
2405 bool Context::useSVM() const
2406 {
2407     Context::Impl* i = p;
2408     CV_Assert(i);
2409     if (!i->svmInitialized)
2410         i->svmInit();
2411     return i->svmEnabled;
2412 }
2413 void Context::setUseSVM(bool enabled)
2414 {
2415     Context::Impl* i = p;
2416     CV_Assert(i);
2417     if (!i->svmInitialized)
2418         i->svmInit();
2419     if (enabled && !i->svmAvailable)
2420     {
2421         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2422     }
2423     i->svmEnabled = enabled;
2424 }
2425 #else
2426 bool Context::useSVM() const { return false; }
2427 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2428 #endif
2429
2430 #ifdef HAVE_OPENCL_SVM
2431 namespace svm {
2432
2433 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2434 {
2435     Context::Impl* i = context.p;
2436     CV_Assert(i);
2437     if (!i->svmInitialized)
2438         i->svmInit();
2439     return i->svmCapabilities;
2440 }
2441
2442 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2443 {
2444     Context::Impl* i = context.p;
2445     CV_Assert(i);
2446     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2447     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2448     return &i->svmFunctions;
2449 }
2450
2451 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2452 {
2453     if (checkForceSVMUmatUsage())
2454         return true;
2455     if (checkDisableSVMUMatUsage())
2456         return false;
2457     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2458         return true;
2459     return false; // don't use SVM by default
2460 }
2461
2462 } // namespace cv::ocl::svm
2463 #endif // HAVE_OPENCL_SVM
2464
2465
2466 static void get_platform_name(cl_platform_id id, String& name)
2467 {
2468     // get platform name string length
2469     size_t sz = 0;
2470     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
2471
2472     // get platform name string
2473     AutoBuffer<char> buf(sz + 1);
2474     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
2475
2476     // just in case, ensure trailing zero for ASCIIZ string
2477     buf[sz] = 0;
2478
2479     name = buf.data();
2480 }
2481
2482 /*
2483 // Attaches OpenCL context to OpenCV
2484 */
2485 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
2486 {
2487     cl_uint cnt = 0;
2488
2489     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
2490
2491     if (cnt == 0)
2492         CV_Error(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
2493
2494     std::vector<cl_platform_id> platforms(cnt);
2495
2496     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
2497
2498     bool platformAvailable = false;
2499
2500     // check if external platformName contained in list of available platforms in OpenCV
2501     for (unsigned int i = 0; i < cnt; i++)
2502     {
2503         String availablePlatformName;
2504         get_platform_name(platforms[i], availablePlatformName);
2505         // external platform is found in the list of available platforms
2506         if (platformName == availablePlatformName)
2507         {
2508             platformAvailable = true;
2509             break;
2510         }
2511     }
2512
2513     if (!platformAvailable)
2514         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2515
2516     // check if platformID corresponds to platformName
2517     String actualPlatformName;
2518     get_platform_name((cl_platform_id)platformID, actualPlatformName);
2519     if (platformName != actualPlatformName)
2520         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2521
2522     // do not initialize OpenCL context
2523     Context ctx = Context::getDefault(false);
2524
2525     // attach supplied context to OpenCV
2526     initializeContextFromHandle(ctx, platformID, context, deviceID);
2527
2528     CV_OCL_CHECK(clRetainContext((cl_context)context));
2529
2530     // clear command queue, if any
2531     CoreTLSData& data = getCoreTlsData();
2532     data.oclQueue.finish();
2533     Queue q;
2534     data.oclQueue = q;
2535
2536     return;
2537 } // attachContext()
2538
2539
2540 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2541 {
2542     cl_context context = (cl_context)_context;
2543     cl_device_id device = (cl_device_id)_device;
2544
2545     // cleanup old context
2546     Context::Impl * impl = ctx.p;
2547     if (impl->handle)
2548     {
2549         CV_OCL_DBG_CHECK(clReleaseContext(impl->handle));
2550     }
2551     impl->devices.clear();
2552
2553     impl->handle = context;
2554     impl->devices.resize(1);
2555     impl->devices[0].set(device);
2556
2557     Platform& p = Platform::getDefault();
2558     Platform::Impl* pImpl = p.p;
2559     pImpl->handle = (cl_platform_id)platform;
2560 }
2561
2562 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2563
2564 struct Queue::Impl
2565 {
2566     inline void __init()
2567     {
2568         refcount = 1;
2569         handle = 0;
2570         isProfilingQueue_ = false;
2571     }
2572
2573     Impl(cl_command_queue q)
2574     {
2575         __init();
2576         handle = q;
2577
2578         cl_command_queue_properties props = 0;
2579         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
2580         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
2581     }
2582
2583     Impl(cl_command_queue q, bool isProfilingQueue)
2584     {
2585         __init();
2586         handle = q;
2587         isProfilingQueue_ = isProfilingQueue;
2588     }
2589
2590     Impl(const Context& c, const Device& d, bool withProfiling = false)
2591     {
2592         __init();
2593
2594         const Context* pc = &c;
2595         cl_context ch = (cl_context)pc->ptr();
2596         if( !ch )
2597         {
2598             pc = &Context::getDefault();
2599             ch = (cl_context)pc->ptr();
2600         }
2601         cl_device_id dh = (cl_device_id)d.ptr();
2602         if( !dh )
2603             dh = (cl_device_id)pc->device(0).ptr();
2604         cl_int retval = 0;
2605         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
2606         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
2607         isProfilingQueue_ = withProfiling;
2608     }
2609
2610     ~Impl()
2611     {
2612 #ifdef _WIN32
2613         if (!cv::__termination)
2614 #endif
2615         {
2616             if(handle)
2617             {
2618                 CV_OCL_DBG_CHECK(clFinish(handle));
2619                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
2620                 handle = NULL;
2621             }
2622         }
2623     }
2624
2625     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
2626     {
2627         if (isProfilingQueue_)
2628             return self;
2629
2630         if (profiling_queue_.ptr())
2631             return profiling_queue_;
2632
2633         cl_context ctx = 0;
2634         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
2635
2636         cl_device_id device = 0;
2637         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
2638
2639         cl_int result = CL_SUCCESS;
2640         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
2641         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
2642         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
2643
2644         Queue queue;
2645         queue.p = new Impl(q, true);
2646         profiling_queue_ = queue;
2647
2648         return profiling_queue_;
2649     }
2650
2651     IMPLEMENT_REFCOUNTABLE();
2652
2653     cl_command_queue handle;
2654     bool isProfilingQueue_;
2655     cv::ocl::Queue profiling_queue_;
2656 };
2657
2658 Queue::Queue()
2659 {
2660     p = 0;
2661 }
2662
2663 Queue::Queue(const Context& c, const Device& d)
2664 {
2665     p = 0;
2666     create(c, d);
2667 }
2668
2669 Queue::Queue(const Queue& q)
2670 {
2671     p = q.p;
2672     if(p)
2673         p->addref();
2674 }
2675
2676 Queue& Queue::operator = (const Queue& q)
2677 {
2678     Impl* newp = (Impl*)q.p;
2679     if(newp)
2680         newp->addref();
2681     if(p)
2682         p->release();
2683     p = newp;
2684     return *this;
2685 }
2686
2687 Queue::~Queue()
2688 {
2689     if(p)
2690         p->release();
2691 }
2692
2693 bool Queue::create(const Context& c, const Device& d)
2694 {
2695     if(p)
2696         p->release();
2697     p = new Impl(c, d);
2698     return p->handle != 0;
2699 }
2700
2701 void Queue::finish()
2702 {
2703     if(p && p->handle)
2704     {
2705         CV_OCL_DBG_CHECK(clFinish(p->handle));
2706     }
2707 }
2708
2709 const Queue& Queue::getProfilingQueue() const
2710 {
2711     CV_Assert(p);
2712     return p->getProfilingQueue(*this);
2713 }
2714
2715 void* Queue::ptr() const
2716 {
2717     return p ? p->handle : 0;
2718 }
2719
2720 Queue& Queue::getDefault()
2721 {
2722     Queue& q = getCoreTlsData().oclQueue;
2723     if( !q.p && haveOpenCL() )
2724         q.create(Context::getDefault());
2725     return q;
2726 }
2727
2728 static cl_command_queue getQueue(const Queue& q)
2729 {
2730     cl_command_queue qq = (cl_command_queue)q.ptr();
2731     if(!qq)
2732         qq = (cl_command_queue)Queue::getDefault().ptr();
2733     return qq;
2734 }
2735
2736 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2737
2738 KernelArg::KernelArg()
2739     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2740 {
2741 }
2742
2743 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2744     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2745 {
2746     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
2747 }
2748
2749 KernelArg KernelArg::Constant(const Mat& m)
2750 {
2751     CV_Assert(m.isContinuous());
2752     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2753 }
2754
2755 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2756
2757 struct Kernel::Impl
2758 {
2759     Impl(const char* kname, const Program& prog) :
2760         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
2761     {
2762         cl_program ph = (cl_program)prog.ptr();
2763         cl_int retval = 0;
2764         name = kname;
2765         if (ph)
2766         {
2767             handle = clCreateKernel(ph, kname, &retval);
2768             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
2769         }
2770         for( int i = 0; i < MAX_ARRS; i++ )
2771             u[i] = 0;
2772         haveTempDstUMats = false;
2773         haveTempSrcUMats = false;
2774     }
2775
2776     void cleanupUMats()
2777     {
2778         for( int i = 0; i < MAX_ARRS; i++ )
2779             if( u[i] )
2780             {
2781                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2782                 {
2783                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
2784                     u[i]->currAllocator->deallocate(u[i]);
2785                 }
2786                 u[i] = 0;
2787             }
2788         nu = 0;
2789         haveTempDstUMats = false;
2790         haveTempSrcUMats = false;
2791     }
2792
2793     void addUMat(const UMat& m, bool dst)
2794     {
2795         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2796         u[nu] = m.u;
2797         CV_XADD(&m.u->urefcount, 1);
2798         nu++;
2799         if(dst && m.u->tempUMat())
2800             haveTempDstUMats = true;
2801         if(m.u->originalUMatData == NULL && m.u->tempUMat())
2802             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
2803     }
2804
2805     void addImage(const Image2D& image)
2806     {
2807         images.push_back(image);
2808     }
2809
2810     void finit(cl_event e)
2811     {
2812         CV_UNUSED(e);
2813         cleanupUMats();
2814         images.clear();
2815         isInProgress = false;
2816         release();
2817     }
2818
2819     bool run(int dims, size_t _globalsize[], size_t _localsize[],
2820             bool sync, int64* timeNS, const Queue& q);
2821
2822     ~Impl()
2823     {
2824         if(handle)
2825         {
2826             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
2827         }
2828     }
2829
2830     IMPLEMENT_REFCOUNTABLE();
2831
2832     cv::String name;
2833     cl_kernel handle;
2834     enum { MAX_ARRS = 16 };
2835     UMatData* u[MAX_ARRS];
2836     bool isInProgress;
2837     bool isAsyncRun;  // true if kernel was scheduled in async mode
2838     int nu;
2839     std::list<Image2D> images;
2840     bool haveTempDstUMats;
2841     bool haveTempSrcUMats;
2842 };
2843
2844 }} // namespace cv::ocl
2845
2846 extern "C" {
2847
2848 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
2849 {
2850     try
2851     {
2852         ((cv::ocl::Kernel::Impl*)p)->finit(e);
2853     }
2854     catch (const cv::Exception& exc)
2855     {
2856         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
2857     }
2858     catch (const std::exception& exc)
2859     {
2860         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
2861     }
2862     catch (...)
2863     {
2864         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
2865     }
2866 }
2867
2868 }
2869
2870 namespace cv { namespace ocl {
2871
2872 Kernel::Kernel()
2873 {
2874     p = 0;
2875 }
2876
2877 Kernel::Kernel(const char* kname, const Program& prog)
2878 {
2879     p = 0;
2880     create(kname, prog);
2881 }
2882
2883 Kernel::Kernel(const char* kname, const ProgramSource& src,
2884                const String& buildopts, String* errmsg)
2885 {
2886     p = 0;
2887     create(kname, src, buildopts, errmsg);
2888 }
2889
2890 Kernel::Kernel(const Kernel& k)
2891 {
2892     p = k.p;
2893     if(p)
2894         p->addref();
2895 }
2896
2897 Kernel& Kernel::operator = (const Kernel& k)
2898 {
2899     Impl* newp = (Impl*)k.p;
2900     if(newp)
2901         newp->addref();
2902     if(p)
2903         p->release();
2904     p = newp;
2905     return *this;
2906 }
2907
2908 Kernel::~Kernel()
2909 {
2910     if(p)
2911         p->release();
2912 }
2913
2914 bool Kernel::create(const char* kname, const Program& prog)
2915 {
2916     if(p)
2917         p->release();
2918     p = new Impl(kname, prog);
2919     if(p->handle == 0)
2920     {
2921         p->release();
2922         p = 0;
2923     }
2924 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
2925     CV_Assert(p);
2926 #endif
2927     return p != 0;
2928 }
2929
2930 bool Kernel::create(const char* kname, const ProgramSource& src,
2931                     const String& buildopts, String* errmsg)
2932 {
2933     if(p)
2934     {
2935         p->release();
2936         p = 0;
2937     }
2938     String tempmsg;
2939     if( !errmsg ) errmsg = &tempmsg;
2940     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2941     return create(kname, prog);
2942 }
2943
2944 void* Kernel::ptr() const
2945 {
2946     return p ? p->handle : 0;
2947 }
2948
2949 bool Kernel::empty() const
2950 {
2951     return ptr() == 0;
2952 }
2953
2954 static cv::String dumpValue(size_t sz, const void* p)
2955 {
2956     if (sz == 4)
2957         return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
2958     if (sz == 8)
2959         return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p);
2960     return cv::format("%p", p);
2961 }
2962
2963 int Kernel::set(int i, const void* value, size_t sz)
2964 {
2965     if (!p || !p->handle)
2966         return -1;
2967     if (i < 0)
2968         return i;
2969     if( i == 0 )
2970         p->cleanupUMats();
2971
2972     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2973     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str());
2974     if (retval != CL_SUCCESS)
2975         return -1;
2976     return i+1;
2977 }
2978
2979 int Kernel::set(int i, const Image2D& image2D)
2980 {
2981     p->addImage(image2D);
2982     cl_mem h = (cl_mem)image2D.ptr();
2983     return set(i, &h, sizeof(h));
2984 }
2985
2986 int Kernel::set(int i, const UMat& m)
2987 {
2988     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
2989 }
2990
2991 int Kernel::set(int i, const KernelArg& arg)
2992 {
2993     if( !p || !p->handle )
2994         return -1;
2995     if (i < 0)
2996     {
2997         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
2998                 p->name.c_str(), (int)i));
2999         return i;
3000     }
3001     if( i == 0 )
3002         p->cleanupUMats();
3003     cl_int status = 0;
3004     if( arg.m )
3005     {
3006         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
3007                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
3008         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3009         if (ptronly && arg.m->empty())
3010         {
3011             cl_mem h_null = (cl_mem)NULL;
3012             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3013             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3014             return i + 1;
3015         }
3016         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3017
3018         if (!h)
3019         {
3020             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3021                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3022             p->release();
3023             p = 0;
3024             return -1;
3025         }
3026
3027 #ifdef HAVE_OPENCL_SVM
3028         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3029         {
3030             const Context& ctx = Context::getDefault();
3031             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3032             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3033             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3034 #if 1 // TODO
3035             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3036 #else
3037             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3038 #endif
3039             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3040         }
3041         else
3042 #endif
3043         {
3044             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3045             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3046         }
3047
3048         if (ptronly)
3049         {
3050             i++;
3051         }
3052         else if( arg.m->dims <= 2 )
3053         {
3054             UMat2D u2d(*arg.m);
3055             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3056             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3057             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3058             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3059             i += 3;
3060
3061             if( !(arg.flags & KernelArg::NO_SIZE) )
3062             {
3063                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3064                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3065                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3066                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3067                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3068                 i += 2;
3069             }
3070         }
3071         else
3072         {
3073             UMat3D u3d(*arg.m);
3074             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3075             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3076             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3077             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3078             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3079             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3080             i += 4;
3081             if( !(arg.flags & KernelArg::NO_SIZE) )
3082             {
3083                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3084                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3085                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3086                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3087                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3088                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3089                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3090                 i += 3;
3091             }
3092         }
3093         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3094         return i;
3095     }
3096     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3097     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3098     return i+1;
3099 }
3100
3101 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3102                  bool sync, const Queue& q)
3103 {
3104     if (!p)
3105         return false;
3106
3107     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3108     size_t total = 1;
3109     CV_Assert(_globalsize != NULL);
3110     for (int i = 0; i < dims; i++)
3111     {
3112         size_t val = _localsize ? _localsize[i] :
3113             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3114         CV_Assert( val > 0 );
3115         total *= _globalsize[i];
3116         if (_globalsize[i] == 1 && !_localsize)
3117             val = 1;
3118         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3119     }
3120     CV_Assert(total > 0);
3121
3122     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3123 }
3124
3125
3126 static bool isRaiseErrorOnReuseAsyncKernel()
3127 {
3128     static bool initialized = false;
3129     static bool value = false;
3130     if (!initialized)
3131     {
3132         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3133         initialized = true;
3134     }
3135     return value;
3136 }
3137
3138 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3139         bool sync, int64* timeNS, const Queue& q)
3140 {
3141     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3142
3143     if (!handle)
3144     {
3145         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3146         return false;
3147     }
3148
3149     if (isAsyncRun)
3150     {
3151         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3152         if (isRaiseErrorOnReuseAsyncKernel())
3153             CV_Assert(0);
3154         return false;  // OpenCV 5.0: raise error
3155     }
3156     isAsyncRun = !sync;
3157
3158     if (isInProgress)
3159     {
3160         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3161         if (isRaiseErrorOnReuseAsyncKernel())
3162             CV_Assert(0);
3163         return false;  // OpenCV 5.0: raise error
3164     }
3165
3166     cl_command_queue qq = getQueue(q);
3167     if (haveTempDstUMats)
3168         sync = true;
3169     if (haveTempSrcUMats)
3170         sync = true;
3171     if (timeNS)
3172         sync = true;
3173     cl_event asyncEvent = 0;
3174     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3175                                            NULL, globalsize, localsize, 0, 0,
3176                                            (sync && !timeNS) ? 0 : &asyncEvent);
3177 #if !CV_OPENCL_SHOW_RUN_KERNELS
3178     if (retval != CL_SUCCESS)
3179 #endif
3180     {
3181         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%dx%dx%d, localsize=%s) sync=%s", name.c_str(), (int)dims,
3182                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3183                         (localsize ? cv::format("%dx%dx%d", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3184                         sync ? "true" : "false"
3185                         );
3186         if (retval != CL_SUCCESS)
3187         {
3188             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3189         }
3190 #if CV_OPENCL_TRACE_CHECK
3191         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3192 #else
3193         printf("%s\n", msg.c_str());
3194         fflush(stdout);
3195 #endif
3196     }
3197     if (sync || retval != CL_SUCCESS)
3198     {
3199         CV_OCL_DBG_CHECK(clFinish(qq));
3200         if (timeNS)
3201         {
3202             if (retval == CL_SUCCESS)
3203             {
3204                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3205                 cl_ulong startTime, stopTime;
3206                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3207                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3208                 *timeNS = (int64)(stopTime - startTime);
3209             }
3210             else
3211             {
3212                 *timeNS = -1;
3213             }
3214         }
3215         cleanupUMats();
3216     }
3217     else
3218     {
3219         addref();
3220         isInProgress = true;
3221         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3222     }
3223     if (asyncEvent)
3224         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3225     return retval == CL_SUCCESS;
3226 }
3227
3228 bool Kernel::runTask(bool sync, const Queue& q)
3229 {
3230     if(!p || !p->handle || p->isInProgress)
3231         return false;
3232
3233     cl_command_queue qq = getQueue(q);
3234     cl_event asyncEvent = 0;
3235     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3236     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3237     if (sync || retval != CL_SUCCESS)
3238     {
3239         CV_OCL_DBG_CHECK(clFinish(qq));
3240         p->cleanupUMats();
3241     }
3242     else
3243     {
3244         p->addref();
3245         p->isInProgress = true;
3246         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3247     }
3248     if (asyncEvent)
3249         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3250     return retval == CL_SUCCESS;
3251 }
3252
3253 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3254 {
3255     CV_Assert(p && p->handle && !p->isInProgress);
3256     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3257     CV_Assert(q.ptr());
3258     q.finish(); // call clFinish() on base queue
3259     Queue profilingQueue = q.getProfilingQueue();
3260     int64 timeNs = -1;
3261     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3262     return res ? timeNs : -1;
3263 }
3264
3265 size_t Kernel::workGroupSize() const
3266 {
3267     if(!p || !p->handle)
3268         return 0;
3269     size_t val = 0, retsz = 0;
3270     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3271     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3272     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3273     return status == CL_SUCCESS ? val : 0;
3274 }
3275
3276 size_t Kernel::preferedWorkGroupSizeMultiple() const
3277 {
3278     if(!p || !p->handle)
3279         return 0;
3280     size_t val = 0, retsz = 0;
3281     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3282     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3283     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3284     return status == CL_SUCCESS ? val : 0;
3285 }
3286
3287 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3288 {
3289     if(!p || !p->handle || !wsz)
3290         return 0;
3291     size_t retsz = 0;
3292     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3293     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3294     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3295     return status == CL_SUCCESS;
3296 }
3297
3298 size_t Kernel::localMemSize() const
3299 {
3300     if(!p || !p->handle)
3301         return 0;
3302     size_t retsz = 0;
3303     cl_ulong val = 0;
3304     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3305     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3306     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3307     return status == CL_SUCCESS ? (size_t)val : 0;
3308 }
3309
3310
3311
3312 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3313
3314 struct ProgramSource::Impl
3315 {
3316     IMPLEMENT_REFCOUNTABLE();
3317
3318     enum KIND {
3319         PROGRAM_SOURCE_CODE = 0,
3320         PROGRAM_BINARIES,
3321         PROGRAM_SPIR,
3322         PROGRAM_SPIRV
3323     } kind_;
3324
3325     Impl(const String& src)
3326     {
3327         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3328         initFromSource(src, cv::String());
3329     }
3330     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3331     {
3332         init(PROGRAM_SOURCE_CODE, module, name);
3333         initFromSource(codeStr, codeHash);
3334     }
3335
3336     /// reset fields
3337     void init(enum KIND kind, const String& module, const String& name)
3338     {
3339         refcount = 1;
3340         kind_ = kind;
3341         module_ = module;
3342         name_ = name;
3343
3344         sourceAddr_ = NULL;
3345         sourceSize_ = 0;
3346         isHashUpdated = false;
3347     }
3348
3349     void initFromSource(const String& codeStr, const String& codeHash)
3350     {
3351         codeStr_ = codeStr;
3352         sourceHash_ = codeHash;
3353         if (sourceHash_.empty())
3354         {
3355             updateHash();
3356         }
3357         else
3358         {
3359             isHashUpdated = true;
3360         }
3361     }
3362
3363     void updateHash(const char* hashStr = NULL)
3364     {
3365         if (hashStr)
3366         {
3367             sourceHash_ = cv::String(hashStr);
3368             isHashUpdated = true;
3369             return;
3370         }
3371         uint64 hash = 0;
3372         switch (kind_)
3373         {
3374         case PROGRAM_SOURCE_CODE:
3375             if (sourceAddr_)
3376             {
3377                 CV_Assert(codeStr_.empty());
3378                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3379             }
3380             else
3381             {
3382                 CV_Assert(!codeStr_.empty());
3383                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3384             }
3385             break;
3386         case PROGRAM_BINARIES:
3387         case PROGRAM_SPIR:
3388         case PROGRAM_SPIRV:
3389             hash = crc64(sourceAddr_, sourceSize_);
3390             break;
3391         default:
3392             CV_Error(Error::StsInternal, "Internal error");
3393         }
3394         sourceHash_ = cv::format("%08llx", hash);
3395         isHashUpdated = true;
3396     }
3397
3398     Impl(enum KIND kind,
3399             const String& module, const String& name,
3400             const unsigned char* binary, const size_t size,
3401             const cv::String& buildOptions = cv::String())
3402     {
3403         init(kind, module, name);
3404
3405         sourceAddr_ = binary;
3406         sourceSize_ = size;
3407
3408         buildOptions_ = buildOptions;
3409     }
3410
3411     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3412             const char* sourceCodeStaticStr, const char* hashStaticStr,
3413             const cv::String& buildOptions)
3414     {
3415         ProgramSource result;
3416         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3417                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3418         result.p->updateHash(hashStaticStr);
3419         return result;
3420     }
3421
3422     static ProgramSource fromBinary(const String& module, const String& name,
3423             const unsigned char* binary, const size_t size,
3424             const cv::String& buildOptions)
3425     {
3426         ProgramSource result;
3427         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3428         return result;
3429     }
3430
3431     static ProgramSource fromSPIR(const String& module, const String& name,
3432             const unsigned char* binary, const size_t size,
3433             const cv::String& buildOptions)
3434     {
3435         ProgramSource result;
3436         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3437         return result;
3438     }
3439
3440     String module_;
3441     String name_;
3442
3443     // TODO std::vector<ProgramSource> includes_;
3444     String codeStr_; // PROGRAM_SOURCE_CODE only
3445
3446     const unsigned char* sourceAddr_;
3447     size_t sourceSize_;
3448
3449     cv::String buildOptions_;
3450
3451     String sourceHash_;
3452     bool isHashUpdated;
3453
3454     friend struct Program::Impl;
3455     friend struct internal::ProgramEntry;
3456     friend struct Context::Impl;
3457 };
3458
3459
3460 ProgramSource::ProgramSource()
3461 {
3462     p = 0;
3463 }
3464
3465 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3466 {
3467     p = new Impl(module, name, codeStr, codeHash);
3468 }
3469
3470 ProgramSource::ProgramSource(const char* prog)
3471 {
3472     p = new Impl(prog);
3473 }
3474
3475 ProgramSource::ProgramSource(const String& prog)
3476 {
3477     p = new Impl(prog);
3478 }
3479
3480 ProgramSource::~ProgramSource()
3481 {
3482     if(p)
3483         p->release();
3484 }
3485
3486 ProgramSource::ProgramSource(const ProgramSource& prog)
3487 {
3488     p = prog.p;
3489     if(p)
3490         p->addref();
3491 }
3492
3493 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3494 {
3495     Impl* newp = (Impl*)prog.p;
3496     if(newp)
3497         newp->addref();
3498     if(p)
3499         p->release();
3500     p = newp;
3501     return *this;
3502 }
3503
3504 const String& ProgramSource::source() const
3505 {
3506     CV_Assert(p);
3507     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
3508     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
3509     return p->codeStr_;
3510 }
3511
3512 ProgramSource::hash_t ProgramSource::hash() const
3513 {
3514     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
3515 }
3516
3517 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
3518         const unsigned char* binary, const size_t size,
3519         const cv::String& buildOptions)
3520 {
3521     CV_Assert(binary);
3522     CV_Assert(size > 0);
3523     return Impl::fromBinary(module, name, binary, size, buildOptions);
3524 }
3525
3526 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
3527         const unsigned char* binary, const size_t size,
3528         const cv::String& buildOptions)
3529 {
3530     CV_Assert(binary);
3531     CV_Assert(size > 0);
3532     return Impl::fromBinary(module, name, binary, size, buildOptions);
3533 }
3534
3535
3536 internal::ProgramEntry::operator ProgramSource&() const
3537 {
3538     if (this->pProgramSource == NULL)
3539     {
3540         cv::AutoLock lock(cv::getInitializationMutex());
3541         if (this->pProgramSource == NULL)
3542         {
3543             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
3544             ProgramSource* ptr = new ProgramSource(ps);
3545             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
3546         }
3547     }
3548     return *this->pProgramSource;
3549 }
3550
3551
3552
3553 /////////////////////////////////////////// Program /////////////////////////////////////////////
3554
3555 static
3556 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
3557 {
3558     if (b.empty())
3559         return a;
3560     if (a.empty())
3561         return b;
3562     if (b[0] == ' ')
3563         return a + b;
3564     return a + (cv::String(" ") + b);
3565 }
3566
3567 struct Program::Impl
3568 {
3569     IMPLEMENT_REFCOUNTABLE();
3570
3571     Impl(const ProgramSource& src,
3572          const String& _buildflags, String& errmsg) :
3573          refcount(1),
3574          handle(NULL),
3575          buildflags(_buildflags)
3576     {
3577         const ProgramSource::Impl* src_ = src.getImpl();
3578         CV_Assert(src_);
3579         sourceModule_ = src_->module_;
3580         sourceName_ = src_->name_;
3581         const Context ctx = Context::getDefault();
3582         Device device = ctx.device(0);
3583         if (ctx.ptr() == NULL || device.ptr() == NULL)
3584             return;
3585         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
3586         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3587         {
3588             if (device.isAMD())
3589                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
3590             else if (device.isIntel())
3591                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
3592             const String param_buildExtraOptions = getBuildExtraOptions();
3593             if (!param_buildExtraOptions.empty())
3594                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
3595         }
3596         compile(ctx, src_, errmsg);
3597     }
3598
3599     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3600     {
3601         CV_Assert(ctx.getImpl());
3602         CV_Assert(src_);
3603
3604         // We don't cache OpenCL binaries
3605         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
3606         {
3607             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3608             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3609             return isLoaded;
3610         }
3611         return compileWithCache(ctx, src_, errmsg);
3612     }
3613
3614     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3615     {
3616         CV_Assert(ctx.getImpl());
3617         CV_Assert(src_);
3618         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
3619
3620 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3621         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
3622         const std::string base_dir = config.prepareCacheDirectoryForContext(
3623                 ctx.getImpl()->getPrefixString(),
3624                 ctx.getImpl()->getPrefixBase()
3625         );
3626         const String& hash_str = src_->sourceHash_;
3627         cv::String fname;
3628         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
3629         {
3630             CV_Assert(!hash_str.empty());
3631             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
3632             fname = utils::fs::join(base_dir, fname);
3633         }
3634         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
3635         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
3636         {
3637             try
3638             {
3639                 std::vector<char> binaryBuf;
3640                 bool res = false;
3641                 {
3642                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3643                     BinaryProgramFile file(fname, hash_str.c_str());
3644                     res = file.read(buildflags, binaryBuf);
3645                 }
3646                 if (res)
3647                 {
3648                     CV_Assert(!binaryBuf.empty());
3649                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
3650                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
3651                     if (isLoaded)
3652                         return true;
3653                 }
3654             }
3655             catch (const cv::Exception& e)
3656             {
3657                 CV_UNUSED(e);
3658                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
3659             }
3660             catch (...)
3661             {
3662                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
3663             }
3664         }
3665 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3666         CV_Assert(handle == NULL);
3667         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3668         {
3669             if (!buildFromSources(ctx, src_, errmsg))
3670             {
3671                 return false;
3672             }
3673         }
3674         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
3675         {
3676             buildflags = joinBuildOptions(buildflags, " -x spir");
3677             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
3678             {
3679                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
3680             }
3681             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3682             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3683             if (!isLoaded)
3684                 return false;
3685         }
3686         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
3687         {
3688             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
3689         }
3690         else
3691         {
3692             CV_Error(Error::StsInternal, "Internal error");
3693         }
3694         CV_Assert(handle != NULL);
3695 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3696         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
3697         {
3698             try
3699             {
3700                 std::vector<char> binaryBuf;
3701                 getProgramBinary(binaryBuf);
3702                 {
3703                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3704                     BinaryProgramFile file(fname, hash_str.c_str());
3705                     file.write(buildflags, binaryBuf);
3706                 }
3707             }
3708             catch (const cv::Exception& e)
3709             {
3710                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
3711             }
3712             catch (...)
3713             {
3714                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
3715             }
3716         }
3717 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3718 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3719         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3720         {
3721             std::vector<char> binaryBuf;
3722             getProgramBinary(binaryBuf);
3723             if (!binaryBuf.empty())
3724             {
3725                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3726                 handle = NULL;
3727                 createFromBinary(ctx, binaryBuf, errmsg);
3728             }
3729         }
3730 #endif
3731         return handle != NULL;
3732     }
3733
3734     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
3735     {
3736         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
3737
3738         size_t retsz = 0;
3739         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3740                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3741         if (log_retval == CL_SUCCESS && retsz > 1)
3742         {
3743             buffer.resize(retsz + 16);
3744             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3745                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
3746             if (log_retval == CL_SUCCESS)
3747             {
3748                 if (retsz < buffer.size())
3749                     buffer[retsz] = 0;
3750                 else
3751                     buffer[buffer.size() - 1] = 0;
3752             }
3753             else
3754             {
3755                 buffer[0] = 0;
3756             }
3757         }
3758
3759         errmsg = String(buffer.data());
3760         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
3761                 sourceModule_.c_str(), sourceName_.c_str(),
3762                 result, getOpenCLErrorString(result),
3763                 buildflags.c_str(), errmsg.c_str());
3764         fflush(stdout);
3765     }
3766
3767     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3768     {
3769         CV_Assert(src_);
3770         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
3771         CV_Assert(handle == NULL);
3772         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
3773                 sourceModule_.c_str(), sourceName_.c_str(),
3774                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
3775
3776         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
3777
3778         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
3779         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
3780         CV_Assert(srcptr != NULL);
3781         CV_Assert(srclen > 0);
3782
3783         cl_int retval = 0;
3784
3785         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3786         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
3787         CV_Assert(handle || retval != CL_SUCCESS);
3788         if (handle && retval == CL_SUCCESS)
3789         {
3790             size_t n = ctx.ndevices();
3791             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
3792             cl_device_id* deviceList = deviceListBuf.data();
3793             for (size_t i = 0; i < n; i++)
3794             {
3795                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
3796             }
3797
3798             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
3799             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
3800 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3801             if (retval != CL_SUCCESS)
3802 #endif
3803             {
3804                 dumpBuildLog_(retval, deviceList, errmsg);
3805
3806                 // don't remove "retval != CL_SUCCESS" condition here:
3807                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
3808                 if (retval != CL_SUCCESS && handle)
3809                 {
3810                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3811                     handle = NULL;
3812                 }
3813             }
3814 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3815             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3816             {
3817                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
3818                 size_t retsz = 0;
3819                 char kernels_buffer[4096] = {0};
3820                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3821                 if (retsz < sizeof(kernels_buffer))
3822                     kernels_buffer[retsz] = 0;
3823                 else
3824                     kernels_buffer[0] = 0;
3825                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3826             }
3827 #endif
3828
3829         }
3830         return handle != NULL;
3831     }
3832
3833     void getProgramBinary(std::vector<char>& buf)
3834     {
3835         CV_Assert(handle);
3836         size_t sz = 0;
3837         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
3838         buf.resize(sz);
3839         uchar* ptr = (uchar*)&buf[0];
3840         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
3841     }
3842
3843     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
3844     {
3845         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
3846     }
3847
3848     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
3849     {
3850         CV_Assert(handle == NULL);
3851         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
3852         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
3853
3854         CV_Assert(binarySize > 0);
3855
3856         size_t ndevices = (int)ctx.ndevices();
3857         AutoBuffer<cl_device_id> devices_(ndevices);
3858         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
3859         AutoBuffer<size_t> binarySizes_(ndevices);
3860
3861         cl_device_id* devices = devices_.data();
3862         const uchar** binaryPtrs = binaryPtrs_.data();
3863         size_t* binarySizes = binarySizes_.data();
3864         for (size_t i = 0; i < ndevices; i++)
3865         {
3866             devices[i] = (cl_device_id)ctx.device(i).ptr();
3867             binaryPtrs[i] = binaryAddr;
3868             binarySizes[i] = binarySize;
3869         }
3870
3871         cl_int result = 0;
3872         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
3873                                            binarySizes, binaryPtrs, NULL, &result);
3874         if (result != CL_SUCCESS)
3875         {
3876             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
3877             if (handle)
3878             {
3879                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3880                 handle = NULL;
3881             }
3882         }
3883         if (!handle)
3884         {
3885             return false;
3886         }
3887         // call clBuildProgram()
3888         {
3889             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
3890             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
3891             if (result != CL_SUCCESS)
3892             {
3893                 dumpBuildLog_(result, devices, errmsg);
3894                 if (handle)
3895                 {
3896                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3897                     handle = NULL;
3898                 }
3899                 return false;
3900             }
3901         }
3902         // check build status
3903         {
3904             cl_build_status build_status = CL_BUILD_NONE;
3905             size_t retsz = 0;
3906             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
3907                     sizeof(build_status), &build_status, &retsz));
3908             if (result == CL_SUCCESS)
3909             {
3910                 if (build_status == CL_BUILD_SUCCESS)
3911                 {
3912                     return true;
3913                 }
3914                 else
3915                 {
3916                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
3917                     return false;
3918                 }
3919             }
3920             else
3921             {
3922                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
3923                 if (handle)
3924                 {
3925                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3926                     handle = NULL;
3927                 }
3928             }
3929         }
3930 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3931         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3932         {
3933             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
3934             size_t retsz = 0;
3935             char kernels_buffer[4096] = {0};
3936             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3937             if (retsz < sizeof(kernels_buffer))
3938                 kernels_buffer[retsz] = 0;
3939             else
3940                 kernels_buffer[0] = 0;
3941             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3942         }
3943 #endif
3944         return handle != NULL;
3945     }
3946
3947     ~Impl()
3948     {
3949         if( handle )
3950         {
3951 #ifdef _WIN32
3952             if (!cv::__termination)
3953 #endif
3954             {
3955                 clReleaseProgram(handle);
3956             }
3957             handle = NULL;
3958         }
3959     }
3960
3961     cl_program handle;
3962
3963     String buildflags;
3964     String sourceModule_;
3965     String sourceName_;
3966 };
3967
3968
3969 Program::Program() { p = 0; }
3970
3971 Program::Program(const ProgramSource& src,
3972         const String& buildflags, String& errmsg)
3973 {
3974     p = 0;
3975     create(src, buildflags, errmsg);
3976 }
3977
3978 Program::Program(const Program& prog)
3979 {
3980     p = prog.p;
3981     if(p)
3982         p->addref();
3983 }
3984
3985 Program& Program::operator = (const Program& prog)
3986 {
3987     Impl* newp = (Impl*)prog.p;
3988     if(newp)
3989         newp->addref();
3990     if(p)
3991         p->release();
3992     p = newp;
3993     return *this;
3994 }
3995
3996 Program::~Program()
3997 {
3998     if(p)
3999         p->release();
4000 }
4001
4002 bool Program::create(const ProgramSource& src,
4003             const String& buildflags, String& errmsg)
4004 {
4005     if(p)
4006     {
4007         p->release();
4008         p = NULL;
4009     }
4010     p = new Impl(src, buildflags, errmsg);
4011     if(!p->handle)
4012     {
4013         p->release();
4014         p = 0;
4015     }
4016     return p != 0;
4017 }
4018
4019 void* Program::ptr() const
4020 {
4021     return p ? p->handle : 0;
4022 }
4023
4024 #ifndef OPENCV_REMOVE_DEPRECATED_API
4025 const ProgramSource& Program::source() const
4026 {
4027     CV_Error(Error::StsNotImplemented, "Removed API");
4028 }
4029
4030 bool Program::read(const String& bin, const String& buildflags)
4031 {
4032     CV_UNUSED(bin); CV_UNUSED(buildflags);
4033     CV_Error(Error::StsNotImplemented, "Removed API");
4034 }
4035
4036 bool Program::write(String& bin) const
4037 {
4038     CV_UNUSED(bin);
4039     CV_Error(Error::StsNotImplemented, "Removed API");
4040 }
4041
4042 String Program::getPrefix() const
4043 {
4044     if(!p)
4045         return String();
4046     Context::Impl* ctx_ = Context::getDefault().getImpl();
4047     CV_Assert(ctx_);
4048     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4049 }
4050
4051 String Program::getPrefix(const String& buildflags)
4052 {
4053         Context::Impl* ctx_ = Context::getDefault().getImpl();
4054         CV_Assert(ctx_);
4055         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4056 }
4057 #endif // OPENCV_REMOVE_DEPRECATED_API
4058
4059 void Program::getBinary(std::vector<char>& binary) const
4060 {
4061     CV_Assert(p && "Empty program");
4062     p->getProgramBinary(binary);
4063 }
4064
4065 Program Context::Impl::getProg(const ProgramSource& src,
4066                                const String& buildflags, String& errmsg)
4067 {
4068     size_t limit = getProgramCountLimit();
4069     const ProgramSource::Impl* src_ = src.getImpl();
4070     CV_Assert(src_);
4071     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4072             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4073             getPrefixString().c_str(),
4074             buildflags.c_str());
4075     {
4076         cv::AutoLock lock(program_cache_mutex);
4077         phash_t::iterator it = phash.find(key);
4078         if (it != phash.end())
4079         {
4080             // TODO LRU cache
4081             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4082             if (i != cacheList.end() && i != cacheList.begin())
4083             {
4084                 cacheList.erase(i);
4085                 cacheList.push_front(key);
4086             }
4087             return it->second;
4088         }
4089         { // cleanup program cache
4090             size_t sz = phash.size();
4091             if (limit > 0 && sz >= limit)
4092             {
4093                 static bool warningFlag = false;
4094                 if (!warningFlag)
4095                 {
4096                     printf("\nWARNING: OpenCV-OpenCL:\n"
4097                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4098                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4099                     warningFlag = true;
4100                 }
4101                 while (!cacheList.empty())
4102                 {
4103                     size_t c = phash.erase(cacheList.back());
4104                     cacheList.pop_back();
4105                     if (c != 0)
4106                         break;
4107                 }
4108             }
4109         }
4110     }
4111     Program prog(src, buildflags, errmsg);
4112     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4113     {
4114         cv::AutoLock lock(program_cache_mutex);
4115         phash.insert(std::pair<std::string, Program>(key, prog));
4116         cacheList.push_front(key);
4117     }
4118     return prog;
4119 }
4120
4121
4122 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4123
4124 template<typename T>
4125 class OpenCLBufferPool
4126 {
4127 protected:
4128     ~OpenCLBufferPool() { }
4129 public:
4130     virtual T allocate(size_t size) = 0;
4131     virtual void release(T buffer) = 0;
4132 };
4133
4134 template <typename Derived, typename BufferEntry, typename T>
4135 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4136 {
4137 private:
4138     inline Derived& derived() { return *static_cast<Derived*>(this); }
4139 protected:
4140     Mutex mutex_;
4141
4142     size_t currentReservedSize;
4143     size_t maxReservedSize;
4144
4145     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4146     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4147
4148     // synchronized
4149     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4150     {
4151         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4152         for (; i != allocatedEntries_.end(); ++i)
4153         {
4154             BufferEntry& e = *i;
4155             if (e.clBuffer_ == buffer)
4156             {
4157                 entry = e;
4158                 allocatedEntries_.erase(i);
4159                 return true;
4160             }
4161         }
4162         return false;
4163     }
4164
4165     // synchronized
4166     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4167     {
4168         if (reservedEntries_.empty())
4169             return false;
4170         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4171         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4172         BufferEntry result;
4173         size_t minDiff = (size_t)(-1);
4174         for (; i != reservedEntries_.end(); ++i)
4175         {
4176             BufferEntry& e = *i;
4177             if (e.capacity_ >= size)
4178             {
4179                 size_t diff = e.capacity_ - size;
4180                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4181                 {
4182                     minDiff = diff;
4183                     result_pos = i;
4184                     result = e;
4185                     if (diff == 0)
4186                         break;
4187                 }
4188             }
4189         }
4190         if (result_pos != reservedEntries_.end())
4191         {
4192             //CV_DbgAssert(result == *result_pos);
4193             reservedEntries_.erase(result_pos);
4194             entry = result;
4195             currentReservedSize -= entry.capacity_;
4196             allocatedEntries_.push_back(entry);
4197             return true;
4198         }
4199         return false;
4200     }
4201
4202     // synchronized
4203     void _checkSizeOfReservedEntries()
4204     {
4205         while (currentReservedSize > maxReservedSize)
4206         {
4207             CV_DbgAssert(!reservedEntries_.empty());
4208             const BufferEntry& entry = reservedEntries_.back();
4209             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4210             currentReservedSize -= entry.capacity_;
4211             derived()._releaseBufferEntry(entry);
4212             reservedEntries_.pop_back();
4213         }
4214     }
4215
4216     inline size_t _allocationGranularity(size_t size)
4217     {
4218         // heuristic values
4219         if (size < 1024*1024)
4220             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4221         else if (size < 16*1024*1024)
4222             return 64*1024;
4223         else
4224             return 1024*1024;
4225     }
4226
4227 public:
4228     OpenCLBufferPoolBaseImpl()
4229         : currentReservedSize(0),
4230           maxReservedSize(0)
4231     {
4232         // nothing
4233     }
4234     virtual ~OpenCLBufferPoolBaseImpl()
4235     {
4236         freeAllReservedBuffers();
4237         CV_Assert(reservedEntries_.empty());
4238     }
4239 public:
4240     virtual T allocate(size_t size) CV_OVERRIDE
4241     {
4242         AutoLock locker(mutex_);
4243         BufferEntry entry;
4244         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4245         {
4246             CV_DbgAssert(size <= entry.capacity_);
4247             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4248         }
4249         else
4250         {
4251             derived()._allocateBufferEntry(entry, size);
4252         }
4253         return entry.clBuffer_;
4254     }
4255     virtual void release(T buffer) CV_OVERRIDE
4256     {
4257         AutoLock locker(mutex_);
4258         BufferEntry entry;
4259         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4260         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4261         {
4262             derived()._releaseBufferEntry(entry);
4263         }
4264         else
4265         {
4266             reservedEntries_.push_front(entry);
4267             currentReservedSize += entry.capacity_;
4268             _checkSizeOfReservedEntries();
4269         }
4270     }
4271
4272     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4273     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4274     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4275     {
4276         AutoLock locker(mutex_);
4277         size_t oldMaxReservedSize = maxReservedSize;
4278         maxReservedSize = size;
4279         if (maxReservedSize < oldMaxReservedSize)
4280         {
4281             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4282             for (; i != reservedEntries_.end();)
4283             {
4284                 const BufferEntry& entry = *i;
4285                 if (entry.capacity_ > maxReservedSize / 8)
4286                 {
4287                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4288                     currentReservedSize -= entry.capacity_;
4289                     derived()._releaseBufferEntry(entry);
4290                     i = reservedEntries_.erase(i);
4291                     continue;
4292                 }
4293                 ++i;
4294             }
4295             _checkSizeOfReservedEntries();
4296         }
4297     }
4298     virtual void freeAllReservedBuffers() CV_OVERRIDE
4299     {
4300         AutoLock locker(mutex_);
4301         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4302         for (; i != reservedEntries_.end(); ++i)
4303         {
4304             const BufferEntry& entry = *i;
4305             derived()._releaseBufferEntry(entry);
4306         }
4307         reservedEntries_.clear();
4308         currentReservedSize = 0;
4309     }
4310 };
4311
4312 struct CLBufferEntry
4313 {
4314     cl_mem clBuffer_;
4315     size_t capacity_;
4316     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4317 };
4318
4319 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4320 {
4321 public:
4322     typedef struct CLBufferEntry BufferEntry;
4323 protected:
4324     int createFlags_;
4325 public:
4326     OpenCLBufferPoolImpl(int createFlags = 0)
4327         : createFlags_(createFlags)
4328     {
4329     }
4330
4331     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4332     {
4333         CV_DbgAssert(entry.clBuffer_ == NULL);
4334         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4335         Context& ctx = Context::getDefault();
4336         cl_int retval = CL_SUCCESS;
4337         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4338         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4339         CV_Assert(entry.clBuffer_ != NULL);
4340         if(retval == CL_SUCCESS)
4341         {
4342             CV_IMPL_ADD(CV_IMPL_OCL);
4343         }
4344         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4345                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4346         allocatedEntries_.push_back(entry);
4347     }
4348
4349     void _releaseBufferEntry(const BufferEntry& entry)
4350     {
4351         CV_Assert(entry.capacity_ != 0);
4352         CV_Assert(entry.clBuffer_ != NULL);
4353         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4354                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4355         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4356     }
4357 };
4358
4359 #ifdef HAVE_OPENCL_SVM
4360 struct CLSVMBufferEntry
4361 {
4362     void* clBuffer_;
4363     size_t capacity_;
4364     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4365 };
4366 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4367 {
4368 public:
4369     typedef struct CLSVMBufferEntry BufferEntry;
4370 public:
4371     OpenCLSVMBufferPoolImpl()
4372     {
4373     }
4374
4375     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4376     {
4377         CV_DbgAssert(entry.clBuffer_ == NULL);
4378         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4379
4380         Context& ctx = Context::getDefault();
4381         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4382         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4383         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4384                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4385
4386         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4387         CV_DbgAssert(svmFns->isValid());
4388
4389         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4390         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4391         CV_Assert(buf);
4392
4393         entry.clBuffer_ = buf;
4394         {
4395             CV_IMPL_ADD(CV_IMPL_OCL);
4396         }
4397         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4398                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4399         allocatedEntries_.push_back(entry);
4400     }
4401
4402     void _releaseBufferEntry(const BufferEntry& entry)
4403     {
4404         CV_Assert(entry.capacity_ != 0);
4405         CV_Assert(entry.clBuffer_ != NULL);
4406         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4407                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4408         Context& ctx = Context::getDefault();
4409         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4410         CV_DbgAssert(svmFns->isValid());
4411         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4412         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4413     }
4414 };
4415 #endif
4416
4417
4418
4419 template <bool readAccess, bool writeAccess>
4420 class AlignedDataPtr
4421 {
4422 protected:
4423     const size_t size_;
4424     uchar* const originPtr_;
4425     const size_t alignment_;
4426     uchar* ptr_;
4427     uchar* allocatedPtr_;
4428
4429 public:
4430     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4431         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4432     {
4433         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4434         CV_DbgAssert(!readAccess || ptr);
4435         if (((size_t)ptr_ & (alignment - 1)) != 0)
4436         {
4437             allocatedPtr_ = new uchar[size_ + alignment - 1];
4438             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4439             if (readAccess)
4440             {
4441                 memcpy(ptr_, originPtr_, size_);
4442             }
4443         }
4444     }
4445
4446     uchar* getAlignedPtr() const
4447     {
4448         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4449         return ptr_;
4450     }
4451
4452     ~AlignedDataPtr()
4453     {
4454         if (allocatedPtr_)
4455         {
4456             if (writeAccess)
4457             {
4458                 memcpy(originPtr_, ptr_, size_);
4459             }
4460             delete[] allocatedPtr_;
4461             allocatedPtr_ = NULL;
4462         }
4463         ptr_ = NULL;
4464     }
4465 private:
4466     AlignedDataPtr(const AlignedDataPtr&); // disabled
4467     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4468 };
4469
4470 template <bool readAccess, bool writeAccess>
4471 class AlignedDataPtr2D
4472 {
4473 protected:
4474     const size_t size_;
4475     uchar* const originPtr_;
4476     const size_t alignment_;
4477     uchar* ptr_;
4478     uchar* allocatedPtr_;
4479     size_t rows_;
4480     size_t cols_;
4481     size_t step_;
4482
4483 public:
4484     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4485         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4486     {
4487         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4488         CV_DbgAssert(!readAccess || ptr != NULL);
4489         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
4490         {
4491             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
4492             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4493             if (readAccess)
4494             {
4495                 for (size_t i = 0; i < rows_; i++)
4496                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
4497             }
4498         }
4499     }
4500
4501     uchar* getAlignedPtr() const
4502     {
4503         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4504         return ptr_;
4505     }
4506
4507     ~AlignedDataPtr2D()
4508     {
4509         if (allocatedPtr_)
4510         {
4511             if (writeAccess)
4512             {
4513                 for (size_t i = 0; i < rows_; i++)
4514                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
4515             }
4516             delete[] allocatedPtr_;
4517             allocatedPtr_ = NULL;
4518         }
4519         ptr_ = NULL;
4520     }
4521 private:
4522     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
4523     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
4524 };
4525
4526 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4527 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4528 #endif
4529
4530 class OpenCLAllocator CV_FINAL : public MatAllocator
4531 {
4532     mutable OpenCLBufferPoolImpl bufferPool;
4533     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4534 #ifdef  HAVE_OPENCL_SVM
4535     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4536 #endif
4537
4538 public:
4539     enum AllocatorFlags
4540     {
4541         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4542         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
4543 #ifdef HAVE_OPENCL_SVM
4544         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
4545 #endif
4546         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
4547     };
4548
4549     OpenCLAllocator()
4550         : bufferPool(0),
4551           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4552     {
4553         size_t defaultPoolSize, poolSize;
4554         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4555         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4556         bufferPool.setMaxReservedSize(poolSize);
4557         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4558         bufferPoolHostPtr.setMaxReservedSize(poolSize);
4559 #ifdef HAVE_OPENCL_SVM
4560         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4561         bufferPoolSVM.setMaxReservedSize(poolSize);
4562 #endif
4563
4564         matStdAllocator = Mat::getDefaultAllocator();
4565     }
4566     ~OpenCLAllocator()
4567     {
4568         flushCleanupQueue();
4569     }
4570
4571     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4572             int flags, UMatUsageFlags usageFlags) const
4573     {
4574         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4575         return u;
4576     }
4577
4578     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
4579     {
4580         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
4581         return value;
4582     }
4583     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
4584     {
4585         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
4586         return value;
4587     }
4588
4589     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4590     {
4591         const Device& dev = ctx.device(0);
4592         createFlags = 0;
4593         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4594             createFlags |= CL_MEM_ALLOC_HOST_PTR;
4595
4596         if (!isOpenCLCopyingForced() &&
4597             (isOpenCLMapForced() ||
4598                 (dev.hostUnifiedMemory()
4599 #ifndef __APPLE__
4600                 || dev.isIntel()
4601 #endif
4602                 )
4603             )
4604         )
4605             flags0 = 0;
4606         else
4607             flags0 = UMatData::COPY_ON_MAP;
4608     }
4609
4610     UMatData* allocate(int dims, const int* sizes, int type,
4611                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4612     {
4613         if(!useOpenCL())
4614             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4615         CV_Assert(data == 0);
4616         size_t total = CV_ELEM_SIZE(type);
4617         for( int i = dims-1; i >= 0; i-- )
4618         {
4619             if( step )
4620                 step[i] = total;
4621             total *= sizes[i];
4622         }
4623
4624         Context& ctx = Context::getDefault();
4625         flushCleanupQueue();
4626
4627         int createFlags = 0, flags0 = 0;
4628         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4629
4630         void* handle = NULL;
4631         int allocatorFlags = 0;
4632
4633 #ifdef HAVE_OPENCL_SVM
4634         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4635         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4636         {
4637             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4638             handle = bufferPoolSVM.allocate(total);
4639
4640             // this property is constant, so single buffer pool can be used here
4641             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4642             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4643         }
4644         else
4645 #endif
4646         if (createFlags == 0)
4647         {
4648             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4649             handle = bufferPool.allocate(total);
4650         }
4651         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4652         {
4653             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4654             handle = bufferPoolHostPtr.allocate(total);
4655         }
4656         else
4657         {
4658             CV_Assert(handle != NULL); // Unsupported, throw
4659         }
4660
4661         if (!handle)
4662             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4663
4664         UMatData* u = new UMatData(this);
4665         u->data = 0;
4666         u->size = total;
4667         u->handle = handle;
4668         u->flags = flags0;
4669         u->allocatorFlags_ = allocatorFlags;
4670         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4671         u->markHostCopyObsolete(true);
4672         opencl_allocator_stats.onAllocate(u->size);
4673         return u;
4674     }
4675
4676     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4677     {
4678         if(!u)
4679             return false;
4680
4681         flushCleanupQueue();
4682
4683         UMatDataAutoLock lock(u);
4684
4685         if(u->handle == 0)
4686         {
4687             CV_Assert(u->origdata != 0);
4688             Context& ctx = Context::getDefault();
4689             int createFlags = 0, flags0 = 0;
4690             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4691
4692             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
4693
4694             cl_context ctx_handle = (cl_context)ctx.ptr();
4695             int allocatorFlags = 0;
4696             int tempUMatFlags = 0;
4697             void* handle = NULL;
4698             cl_int retval = CL_SUCCESS;
4699
4700 #ifdef HAVE_OPENCL_SVM
4701             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4702             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4703             if (useSVM && svmCaps.isSupportFineGrainSystem())
4704             {
4705                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4706                 tempUMatFlags = UMatData::TEMP_UMAT;
4707                 handle = u->origdata;
4708                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4709             }
4710             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4711             {
4712                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4713                 {
4714                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4715
4716                     cl_svm_mem_flags memFlags = createFlags |
4717                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4718
4719                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4720                     CV_DbgAssert(svmFns->isValid());
4721
4722                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4723                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4724                     CV_Assert(handle);
4725
4726                     cl_command_queue q = NULL;
4727                     if (!isFineGrainBuffer)
4728                     {
4729                         q = (cl_command_queue)Queue::getDefault().ptr();
4730                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4731                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4732                                 handle, u->size,
4733                                 0, NULL, NULL);
4734                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4735
4736                     }
4737                     memcpy(handle, u->origdata, u->size);
4738                     if (!isFineGrainBuffer)
4739                     {
4740                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4741                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4742                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4743                     }
4744
4745                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4746                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4747                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4748                 }
4749             }
4750             else
4751 #endif
4752             {
4753                 if( copyOnMap )
4754                     accessFlags &= ~ACCESS_FAST;
4755
4756                 tempUMatFlags = UMatData::TEMP_UMAT;
4757                 if (
4758                 #ifdef __APPLE__
4759                     !copyOnMap &&
4760                 #endif
4761                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
4762                     // There are OpenCL runtime issues for less aligned data
4763                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
4764                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
4765                     // Avoid sharing of host memory between OpenCL buffers
4766                     && !(u->originalUMatData && u->originalUMatData->handle)
4767                 )
4768                 {
4769                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4770                                             u->size, u->origdata, &retval);
4771                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
4772                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4773                 }
4774                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4775                 {
4776                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4777                                                u->size, u->origdata, &retval);
4778                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
4779                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4780                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4781                 }
4782             }
4783             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
4784             if(!handle || retval != CL_SUCCESS)
4785                 return false;
4786             u->handle = handle;
4787             u->prevAllocator = u->currAllocator;
4788             u->currAllocator = this;
4789             u->flags |= tempUMatFlags | flags0;
4790             u->allocatorFlags_ = allocatorFlags;
4791         }
4792         if(accessFlags & ACCESS_WRITE)
4793             u->markHostCopyObsolete(true);
4794         opencl_allocator_stats.onAllocate(u->size);
4795         return true;
4796     }
4797
4798     /*void sync(UMatData* u) const
4799     {
4800         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4801         UMatDataAutoLock lock(u);
4802
4803         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4804         {
4805             if( u->tempCopiedUMat() )
4806             {
4807                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4808                                     u->size, u->origdata, 0, 0, 0);
4809             }
4810             else
4811             {
4812                 cl_int retval = 0;
4813                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4814                                                 (CL_MAP_READ | CL_MAP_WRITE),
4815                                                 0, u->size, 0, 0, 0, &retval);
4816                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4817                 clFinish(q);
4818             }
4819             u->markHostCopyObsolete(false);
4820         }
4821         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4822         {
4823             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4824                                  u->size, u->data, 0, 0, 0);
4825         }
4826     }*/
4827
4828     void deallocate(UMatData* u) const CV_OVERRIDE
4829     {
4830         if(!u)
4831             return;
4832
4833         CV_Assert(u->urefcount == 0);
4834         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
4835
4836         CV_Assert(u->handle != 0);
4837         CV_Assert(u->mapcount == 0);
4838
4839         if (u->flags & UMatData::ASYNC_CLEANUP)
4840             addToCleanupQueue(u);
4841         else
4842             deallocate_(u);
4843     }
4844
4845     void deallocate_(UMatData* u) const
4846     {
4847         CV_Assert(u);
4848         CV_Assert(u->handle);
4849         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
4850         {
4851             opencl_allocator_stats.onFree(u->size);
4852         }
4853
4854 #ifdef _WIN32
4855         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
4856             return;             // avoid any OpenCL calls
4857 #endif
4858         if(u->tempUMat())
4859         {
4860             CV_Assert(u->origdata);
4861 //            UMatDataAutoLock lock(u);
4862
4863             if (u->hostCopyObsolete())
4864             {
4865 #ifdef HAVE_OPENCL_SVM
4866                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4867                 {
4868                     Context& ctx = Context::getDefault();
4869                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4870                     CV_DbgAssert(svmFns->isValid());
4871
4872                     if( u->tempCopiedUMat() )
4873                     {
4874                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4875                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4876                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4877                         cl_command_queue q = NULL;
4878                         if (!isFineGrainBuffer)
4879                         {
4880                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4881                             q = (cl_command_queue)Queue::getDefault().ptr();
4882                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4883                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4884                                     u->handle, u->size,
4885                                     0, NULL, NULL);
4886                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4887                         }
4888                         clFinish(q);
4889                         memcpy(u->origdata, u->handle, u->size);
4890                         if (!isFineGrainBuffer)
4891                         {
4892                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4893                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4894                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4895                         }
4896                     }
4897                     else
4898                     {
4899                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4900                         // nothing
4901                     }
4902                 }
4903                 else
4904 #endif
4905                 {
4906                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4907                     if( u->tempCopiedUMat() )
4908                     {
4909                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4910                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4911                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
4912                     }
4913                     else
4914                     {
4915                         cl_int retval = 0;
4916                         if (u->tempUMat())
4917                         {
4918                             CV_Assert(u->mapcount == 0);
4919                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
4920                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4921                                 (CL_MAP_READ | CL_MAP_WRITE),
4922                                 0, u->size, 0, 0, 0, &retval);
4923                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
4924                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
4925                             if (u->originalUMatData)
4926                             {
4927                                 CV_Assert(u->originalUMatData->data == data);
4928                             }
4929                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4930                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
4931                             CV_OCL_DBG_CHECK(clFinish(q));
4932                         }
4933                     }
4934                 }
4935                 u->markHostCopyObsolete(false);
4936             }
4937             else
4938             {
4939                 // nothing
4940             }
4941 #ifdef HAVE_OPENCL_SVM
4942             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4943             {
4944                 if( u->tempCopiedUMat() )
4945                 {
4946                     Context& ctx = Context::getDefault();
4947                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4948                     CV_DbgAssert(svmFns->isValid());
4949
4950                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4951                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4952                 }
4953             }
4954             else
4955 #endif
4956             {
4957                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
4958                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
4959             }
4960             u->handle = 0;
4961             u->markDeviceCopyObsolete(true);
4962             u->currAllocator = u->prevAllocator;
4963             u->prevAllocator = NULL;
4964             if(u->data && u->copyOnMap() && u->data != u->origdata)
4965                 fastFree(u->data);
4966             u->data = u->origdata;
4967             u->currAllocator->deallocate(u);
4968             u = NULL;
4969         }
4970         else
4971         {
4972             CV_Assert(u->origdata == NULL);
4973             if(u->data && u->copyOnMap() && u->data != u->origdata)
4974             {
4975                 fastFree(u->data);
4976                 u->data = 0;
4977                 u->markHostCopyObsolete(true);
4978             }
4979             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4980             {
4981                 bufferPool.release((cl_mem)u->handle);
4982             }
4983             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4984             {
4985                 bufferPoolHostPtr.release((cl_mem)u->handle);
4986             }
4987 #ifdef HAVE_OPENCL_SVM
4988             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4989             {
4990                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4991                 {
4992                     //nothing
4993                 }
4994                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4995                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4996                 {
4997                     Context& ctx = Context::getDefault();
4998                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4999                     CV_DbgAssert(svmFns->isValid());
5000                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5001
5002                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5003                     {
5004                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5005                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5006                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5007                     }
5008                 }
5009                 bufferPoolSVM.release((void*)u->handle);
5010             }
5011 #endif
5012             else
5013             {
5014                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5015             }
5016             u->handle = 0;
5017             u->markDeviceCopyObsolete(true);
5018             delete u;
5019             u = NULL;
5020         }
5021         CV_Assert(u == NULL);
5022     }
5023
5024     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5025     void map(UMatData* u, int accessFlags) const CV_OVERRIDE
5026     {
5027         CV_Assert(u && u->handle);
5028
5029         if(accessFlags & ACCESS_WRITE)
5030             u->markDeviceCopyObsolete(true);
5031
5032         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5033
5034         {
5035             if( !u->copyOnMap() )
5036             {
5037                 // TODO
5038                 // because there can be other map requests for the same UMat with different access flags,
5039                 // we use the universal (read-write) access mode.
5040 #ifdef HAVE_OPENCL_SVM
5041                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5042                 {
5043                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5044                     {
5045                         Context& ctx = Context::getDefault();
5046                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5047                         CV_DbgAssert(svmFns->isValid());
5048
5049                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5050                         {
5051                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5052                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5053                                     u->handle, u->size,
5054                                     0, NULL, NULL);
5055                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5056                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5057                         }
5058                     }
5059                     clFinish(q);
5060                     u->data = (uchar*)u->handle;
5061                     u->markHostCopyObsolete(false);
5062                     u->markDeviceMemMapped(true);
5063                     return;
5064                 }
5065 #endif
5066
5067                 cl_int retval = CL_SUCCESS;
5068                 if (!u->deviceMemMapped())
5069                 {
5070                     CV_Assert(u->refcount == 1);
5071                     CV_Assert(u->mapcount++ == 0);
5072                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5073                                                          (CL_MAP_READ | CL_MAP_WRITE),
5074                                                          0, u->size, 0, 0, 0, &retval);
5075                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5076                 }
5077                 if (u->data && retval == CL_SUCCESS)
5078                 {
5079                     u->markHostCopyObsolete(false);
5080                     u->markDeviceMemMapped(true);
5081                     return;
5082                 }
5083
5084                 // TODO Is it really a good idea and was it tested well?
5085                 // if map failed, switch to copy-on-map mode for the particular buffer
5086                 u->flags |= UMatData::COPY_ON_MAP;
5087             }
5088
5089             if(!u->data)
5090             {
5091                 u->data = (uchar*)fastMalloc(u->size);
5092                 u->markHostCopyObsolete(true);
5093             }
5094         }
5095
5096         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
5097         {
5098             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5099 #ifdef HAVE_OPENCL_SVM
5100             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5101 #endif
5102             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5103                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5104             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5105                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5106             u->markHostCopyObsolete(false);
5107         }
5108     }
5109
5110     void unmap(UMatData* u) const CV_OVERRIDE
5111     {
5112         if(!u)
5113             return;
5114
5115
5116         CV_Assert(u->handle != 0);
5117
5118         UMatDataAutoLock autolock(u);
5119
5120         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5121         cl_int retval = 0;
5122         if( !u->copyOnMap() && u->deviceMemMapped() )
5123         {
5124             CV_Assert(u->data != NULL);
5125 #ifdef HAVE_OPENCL_SVM
5126             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5127             {
5128                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5129                 {
5130                     Context& ctx = Context::getDefault();
5131                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5132                     CV_DbgAssert(svmFns->isValid());
5133
5134                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5135                     {
5136                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5137                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5138                                 0, NULL, NULL);
5139                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5140                         clFinish(q);
5141                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5142                     }
5143                 }
5144                 if (u->refcount == 0)
5145                     u->data = 0;
5146                 u->markDeviceCopyObsolete(false);
5147                 u->markHostCopyObsolete(true);
5148                 return;
5149             }
5150 #endif
5151             if (u->refcount == 0)
5152             {
5153                 CV_Assert(u->mapcount-- == 1);
5154                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5155                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5156                 if (Device::getDefault().isAMD())
5157                 {
5158                     // required for multithreaded applications (see stitching test)
5159                     CV_OCL_DBG_CHECK(clFinish(q));
5160                 }
5161                 u->markDeviceMemMapped(false);
5162                 u->data = 0;
5163                 u->markDeviceCopyObsolete(false);
5164                 u->markHostCopyObsolete(true);
5165             }
5166         }
5167         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5168         {
5169             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5170 #ifdef HAVE_OPENCL_SVM
5171             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5172 #endif
5173             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5174                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5175             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5176                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5177             u->markDeviceCopyObsolete(false);
5178             u->markHostCopyObsolete(true);
5179         }
5180     }
5181
5182     bool checkContinuous(int dims, const size_t sz[],
5183                          const size_t srcofs[], const size_t srcstep[],
5184                          const size_t dstofs[], const size_t dststep[],
5185                          size_t& total, size_t new_sz[],
5186                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5187                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5188     {
5189         bool iscontinuous = true;
5190         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5191         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5192         total = sz[dims-1];
5193         for( int i = dims-2; i >= 0; i-- )
5194         {
5195             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5196                 iscontinuous = false;
5197             total *= sz[i];
5198             if( srcofs )
5199                 srcrawofs += srcofs[i]*srcstep[i];
5200             if( dstofs )
5201                 dstrawofs += dstofs[i]*dststep[i];
5202         }
5203
5204         if( !iscontinuous )
5205         {
5206             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5207             if( dims == 2 )
5208             {
5209                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5210                 // we assume that new_... arrays are initialized by caller
5211                 // with 0's, so there is no else branch
5212                 if( srcofs )
5213                 {
5214                     new_srcofs[0] = srcofs[1];
5215                     new_srcofs[1] = srcofs[0];
5216                     new_srcofs[2] = 0;
5217                 }
5218
5219                 if( dstofs )
5220                 {
5221                     new_dstofs[0] = dstofs[1];
5222                     new_dstofs[1] = dstofs[0];
5223                     new_dstofs[2] = 0;
5224                 }
5225
5226                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5227                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5228             }
5229             else
5230             {
5231                 // we could check for dims == 3 here,
5232                 // but from user perspective this one is more informative
5233                 CV_Assert(dims <= 3);
5234                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5235                 if( srcofs )
5236                 {
5237                     new_srcofs[0] = srcofs[2];
5238                     new_srcofs[1] = srcofs[1];
5239                     new_srcofs[2] = srcofs[0];
5240                 }
5241
5242                 if( dstofs )
5243                 {
5244                     new_dstofs[0] = dstofs[2];
5245                     new_dstofs[1] = dstofs[1];
5246                     new_dstofs[2] = dstofs[0];
5247                 }
5248
5249                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5250                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5251             }
5252         }
5253         return iscontinuous;
5254     }
5255
5256     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5257                   const size_t srcofs[], const size_t srcstep[],
5258                   const size_t dststep[]) const CV_OVERRIDE
5259     {
5260         if(!u)
5261             return;
5262         UMatDataAutoLock autolock(u);
5263
5264         if( u->data && !u->hostCopyObsolete() )
5265         {
5266             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5267             return;
5268         }
5269         CV_Assert( u->handle != 0 );
5270
5271         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5272
5273         size_t total = 0, new_sz[] = {0, 0, 0};
5274         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5275         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5276
5277         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5278                                             total, new_sz,
5279                                             srcrawofs, new_srcofs, new_srcstep,
5280                                             dstrawofs, new_dstofs, new_dststep);
5281
5282 #ifdef HAVE_OPENCL_SVM
5283         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5284         {
5285             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5286             Context& ctx = Context::getDefault();
5287             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5288             CV_DbgAssert(svmFns->isValid());
5289
5290             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5291             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5292             {
5293                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5294                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5295                         u->handle, u->size,
5296                         0, NULL, NULL);
5297                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5298             }
5299             clFinish(q);
5300             if( iscontinuous )
5301             {
5302                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5303             }
5304             else
5305             {
5306                 // This code is from MatAllocator::download()
5307                 int isz[CV_MAX_DIM];
5308                 uchar* srcptr = (uchar*)u->handle;
5309                 for( int i = 0; i < dims; i++ )
5310                 {
5311                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5312                     if( sz[i] == 0 )
5313                     return;
5314                     if( srcofs )
5315                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5316                     isz[i] = (int)sz[i];
5317                 }
5318
5319                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5320                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5321
5322                 const Mat* arrays[] = { &src, &dst };
5323                 uchar* ptrs[2];
5324                 NAryMatIterator it(arrays, ptrs, 2);
5325                 size_t j, planesz = it.size;
5326
5327                 for( j = 0; j < it.nplanes; j++, ++it )
5328                     memcpy(ptrs[1], ptrs[0], planesz);
5329             }
5330             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5331             {
5332                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5333                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5334                         0, NULL, NULL);
5335                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5336                 clFinish(q);
5337             }
5338         }
5339         else
5340 #endif
5341         {
5342             if( iscontinuous )
5343             {
5344                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5345                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5346                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5347             }
5348             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5349             {
5350                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5351                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5352                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5353                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5354                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5355                 uchar* ptr = alignedPtr.getAlignedPtr();
5356
5357                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5358                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5359                 total = std::min(total, u->size - new_srcrawofs);
5360                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5361                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5362                 for( size_t i = 0; i < new_sz[1]; i++ )
5363                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5364             }
5365             else
5366             {
5367                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5368                 uchar* ptr = alignedPtr.getAlignedPtr();
5369
5370                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5371                     new_srcofs, new_dstofs, new_sz,
5372                     new_srcstep[0], 0,
5373                     new_dststep[0], 0,
5374                     ptr, 0, 0, 0));
5375             }
5376         }
5377     }
5378
5379     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5380                 const size_t dstofs[], const size_t dststep[],
5381                 const size_t srcstep[]) const CV_OVERRIDE
5382     {
5383         if(!u)
5384             return;
5385
5386         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5387         CV_Assert(u->refcount == 0 || u->tempUMat());
5388
5389         size_t total = 0, new_sz[] = {0, 0, 0};
5390         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5391         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5392
5393         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5394                                             total, new_sz,
5395                                             srcrawofs, new_srcofs, new_srcstep,
5396                                             dstrawofs, new_dstofs, new_dststep);
5397
5398         UMatDataAutoLock autolock(u);
5399
5400         // if there is cached CPU copy of the GPU matrix,
5401         // we could use it as a destination.
5402         // we can do it in 2 cases:
5403         //    1. we overwrite the whole content
5404         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5405         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5406         {
5407             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5408             u->markHostCopyObsolete(false);
5409             u->markDeviceCopyObsolete(true);
5410             return;
5411         }
5412
5413         CV_Assert( u->handle != 0 );
5414         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5415
5416 #ifdef HAVE_OPENCL_SVM
5417         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5418         {
5419             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5420             Context& ctx = Context::getDefault();
5421             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5422             CV_DbgAssert(svmFns->isValid());
5423
5424             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5425             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5426             {
5427                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5428                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5429                         u->handle, u->size,
5430                         0, NULL, NULL);
5431                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5432             }
5433             clFinish(q);
5434             if( iscontinuous )
5435             {
5436                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5437             }
5438             else
5439             {
5440                 // This code is from MatAllocator::upload()
5441                 int isz[CV_MAX_DIM];
5442                 uchar* dstptr = (uchar*)u->handle;
5443                 for( int i = 0; i < dims; i++ )
5444                 {
5445                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5446                     if( sz[i] == 0 )
5447                     return;
5448                     if( dstofs )
5449                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5450                     isz[i] = (int)sz[i];
5451                 }
5452
5453                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5454                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5455
5456                 const Mat* arrays[] = { &src, &dst };
5457                 uchar* ptrs[2];
5458                 NAryMatIterator it(arrays, ptrs, 2);
5459                 size_t j, planesz = it.size;
5460
5461                 for( j = 0; j < it.nplanes; j++, ++it )
5462                     memcpy(ptrs[1], ptrs[0], planesz);
5463             }
5464             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5465             {
5466                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5467                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5468                         0, NULL, NULL);
5469                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5470                 clFinish(q);
5471             }
5472         }
5473         else
5474 #endif
5475         {
5476             if( iscontinuous )
5477             {
5478                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5479                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5480                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
5481                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
5482                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5483             }
5484             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5485             {
5486                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5487                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5488                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
5489                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
5490                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5491                 uchar* ptr = alignedPtr.getAlignedPtr();
5492
5493                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5494                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
5495                 total = std::min(total, u->size - new_dstrawofs);
5496                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
5497                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
5498                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
5499                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5500                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5501                 for( size_t i = 0; i < new_sz[1]; i++ )
5502                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
5503                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5504                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5505             }
5506             else
5507             {
5508                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5509                 uchar* ptr = alignedPtr.getAlignedPtr();
5510
5511                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5512                     new_dstofs, new_srcofs, new_sz,
5513                     new_dststep[0], 0,
5514                     new_srcstep[0], 0,
5515                     ptr, 0, 0, 0));
5516             }
5517         }
5518         u->markHostCopyObsolete(true);
5519 #ifdef HAVE_OPENCL_SVM
5520         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5521                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5522         {
5523             // nothing
5524         }
5525         else
5526 #endif
5527         {
5528             u->markHostCopyObsolete(true);
5529         }
5530         u->markDeviceCopyObsolete(false);
5531     }
5532
5533     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
5534               const size_t srcofs[], const size_t srcstep[],
5535               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
5536     {
5537         if(!src || !dst)
5538             return;
5539
5540         size_t total = 0, new_sz[] = {0, 0, 0};
5541         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5542         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5543
5544         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
5545                                             total, new_sz,
5546                                             srcrawofs, new_srcofs, new_srcstep,
5547                                             dstrawofs, new_dstofs, new_dststep);
5548
5549         UMatDataAutoLock src_autolock(src, dst);
5550
5551         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5552         {
5553             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5554             return;
5555         }
5556         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5557         {
5558             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5559             dst->markHostCopyObsolete(false);
5560 #ifdef HAVE_OPENCL_SVM
5561             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5562                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5563             {
5564                 // nothing
5565             }
5566             else
5567 #endif
5568             {
5569                 dst->markDeviceCopyObsolete(true);
5570             }
5571             return;
5572         }
5573
5574         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5575         CV_Assert(dst->refcount == 0);
5576         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5577
5578         cl_int retval = CL_SUCCESS;
5579 #ifdef HAVE_OPENCL_SVM
5580         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5581                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5582         {
5583             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5584                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5585             {
5586                 Context& ctx = Context::getDefault();
5587                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5588                 CV_DbgAssert(svmFns->isValid());
5589
5590                 if( iscontinuous )
5591                 {
5592                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5593                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5594                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5595                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5596                             total, 0, NULL, NULL);
5597                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
5598                 }
5599                 else
5600                 {
5601                     clFinish(q);
5602                     // This code is from MatAllocator::download()/upload()
5603                     int isz[CV_MAX_DIM];
5604                     uchar* srcptr = (uchar*)src->handle;
5605                     for( int i = 0; i < dims; i++ )
5606                     {
5607                         CV_Assert( sz[i] <= (size_t)INT_MAX );
5608                         if( sz[i] == 0 )
5609                         return;
5610                         if( srcofs )
5611                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5612                         isz[i] = (int)sz[i];
5613                     }
5614                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5615
5616                     uchar* dstptr = (uchar*)dst->handle;
5617                     for( int i = 0; i < dims; i++ )
5618                     {
5619                         if( dstofs )
5620                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5621                     }
5622                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5623
5624                     const Mat* arrays[] = { &m_src, &m_dst };
5625                     uchar* ptrs[2];
5626                     NAryMatIterator it(arrays, ptrs, 2);
5627                     size_t j, planesz = it.size;
5628
5629                     for( j = 0; j < it.nplanes; j++, ++it )
5630                         memcpy(ptrs[1], ptrs[0], planesz);
5631                 }
5632             }
5633             else
5634             {
5635                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5636                 {
5637                     map(src, ACCESS_READ);
5638                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5639                     unmap(src);
5640                 }
5641                 else
5642                 {
5643                     map(dst, ACCESS_WRITE);
5644                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5645                     unmap(dst);
5646                 }
5647             }
5648         }
5649         else
5650 #endif
5651         {
5652             if( iscontinuous )
5653             {
5654                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5655                                                srcrawofs, dstrawofs, total, 0, 0, 0);
5656                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
5657                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
5658             }
5659             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5660             {
5661                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5662                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5663                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
5664                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5665                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
5666
5667                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5668                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5669                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
5670                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5671                 uchar* srcptr = srcBuf.getAlignedPtr();
5672                 uchar* dstptr = dstBuf.getAlignedPtr();
5673
5674                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5675
5676                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
5677                 src_total = std::min(src_total, src->size - new_srcrawofs);
5678                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
5679                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
5680
5681                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
5682                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
5683                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5684                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5685
5686                 for( size_t i = 0; i < new_sz[1]; i++ )
5687                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
5688                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
5689                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5690                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5691             }
5692             else
5693             {
5694                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5695                                                    new_srcofs, new_dstofs, new_sz,
5696                                                    new_srcstep[0], 0,
5697                                                    new_dststep[0], 0,
5698                                                    0, 0, 0));
5699             }
5700         }
5701         if (retval == CL_SUCCESS)
5702         {
5703             CV_IMPL_ADD(CV_IMPL_OCL)
5704         }
5705
5706 #ifdef HAVE_OPENCL_SVM
5707         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5708             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5709         {
5710             // nothing
5711         }
5712         else
5713 #endif
5714         {
5715             dst->markHostCopyObsolete(true);
5716         }
5717         dst->markDeviceCopyObsolete(false);
5718
5719         if( _sync )
5720         {
5721             CV_OCL_DBG_CHECK(clFinish(q));
5722         }
5723     }
5724
5725     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE {
5726 #ifdef HAVE_OPENCL_SVM
5727         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5728         {
5729             return &bufferPoolSVM;
5730         }
5731 #endif
5732         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5733         {
5734             return &bufferPoolHostPtr;
5735         }
5736         if (id != NULL && strcmp(id, "OCL") != 0)
5737         {
5738             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5739         }
5740         return &bufferPool;
5741     }
5742
5743     MatAllocator* matStdAllocator;
5744
5745     mutable cv::Mutex cleanupQueueMutex;
5746     mutable std::deque<UMatData*> cleanupQueue;
5747
5748     void flushCleanupQueue() const
5749     {
5750         if (!cleanupQueue.empty())
5751         {
5752             std::deque<UMatData*> q;
5753             {
5754                 cv::AutoLock lock(cleanupQueueMutex);
5755                 q.swap(cleanupQueue);
5756             }
5757             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
5758             {
5759                 deallocate_(*i);
5760             }
5761         }
5762     }
5763     void addToCleanupQueue(UMatData* u) const
5764     {
5765         //TODO: Validation check: CV_Assert(!u->tempUMat());
5766         {
5767             cv::AutoLock lock(cleanupQueueMutex);
5768             cleanupQueue.push_back(u);
5769         }
5770     }
5771 };
5772
5773 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
5774 {
5775     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
5776     g_isOpenCVActivated = true;
5777     return g_allocator;
5778 }
5779 MatAllocator* getOpenCLAllocator()
5780 {
5781     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
5782 }
5783
5784 }} // namespace cv::ocl
5785
5786
5787 namespace cv {
5788
5789 // three funcs below are implemented in umatrix.cpp
5790 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
5791               bool autoSteps = false );
5792 void finalizeHdr(UMat& m);
5793
5794 } // namespace cv
5795
5796
5797 namespace cv { namespace ocl {
5798
5799 /*
5800 // Convert OpenCL buffer memory to UMat
5801 */
5802 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
5803 {
5804     int d = 2;
5805     int sizes[] = { rows, cols };
5806
5807     CV_Assert(0 <= d && d <= CV_MAX_DIM);
5808
5809     dst.release();
5810
5811     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
5812     dst.usageFlags = USAGE_DEFAULT;
5813
5814     setSize(dst, d, sizes, 0, true);
5815     dst.offset = 0;
5816
5817     cl_mem             memobj = (cl_mem)cl_mem_buffer;
5818     cl_mem_object_type mem_type = 0;
5819
5820     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5821
5822     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
5823
5824     size_t total = 0;
5825     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
5826
5827     CV_OCL_CHECK(clRetainMemObject(memobj));
5828
5829     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
5830     CV_Assert(total >= rows * step);
5831
5832     // attach clBuffer to UMatData
5833     dst.u = new UMatData(getOpenCLAllocator());
5834     dst.u->data            = 0;
5835     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
5836     dst.u->flags           = 0;
5837     dst.u->handle          = cl_mem_buffer;
5838     dst.u->origdata        = 0;
5839     dst.u->prevAllocator   = 0;
5840     dst.u->size            = total;
5841
5842     finalizeHdr(dst);
5843     dst.addref();
5844
5845     return;
5846 } // convertFromBuffer()
5847
5848
5849 /*
5850 // Convert OpenCL image2d_t memory to UMat
5851 */
5852 void convertFromImage(void* cl_mem_image, UMat& dst)
5853 {
5854     cl_mem             clImage = (cl_mem)cl_mem_image;
5855     cl_mem_object_type mem_type = 0;
5856
5857     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5858
5859     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
5860
5861     cl_image_format fmt = { 0, 0 };
5862     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
5863
5864     int depth = CV_8U;
5865     switch (fmt.image_channel_data_type)
5866     {
5867     case CL_UNORM_INT8:
5868     case CL_UNSIGNED_INT8:
5869         depth = CV_8U;
5870         break;
5871
5872     case CL_SNORM_INT8:
5873     case CL_SIGNED_INT8:
5874         depth = CV_8S;
5875         break;
5876
5877     case CL_UNORM_INT16:
5878     case CL_UNSIGNED_INT16:
5879         depth = CV_16U;
5880         break;
5881
5882     case CL_SNORM_INT16:
5883     case CL_SIGNED_INT16:
5884         depth = CV_16S;
5885         break;
5886
5887     case CL_SIGNED_INT32:
5888         depth = CV_32S;
5889         break;
5890
5891     case CL_FLOAT:
5892         depth = CV_32F;
5893         break;
5894
5895     default:
5896         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
5897     }
5898
5899     int type = CV_8UC1;
5900     switch (fmt.image_channel_order)
5901     {
5902     case CL_R:
5903         type = CV_MAKE_TYPE(depth, 1);
5904         break;
5905
5906     case CL_RGBA:
5907     case CL_BGRA:
5908     case CL_ARGB:
5909         type = CV_MAKE_TYPE(depth, 4);
5910         break;
5911
5912     default:
5913         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
5914         break;
5915     }
5916
5917     size_t step = 0;
5918     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
5919
5920     size_t w = 0;
5921     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
5922
5923     size_t h = 0;
5924     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
5925
5926     dst.create((int)h, (int)w, type);
5927
5928     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
5929
5930     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5931
5932     size_t offset = 0;
5933     size_t src_origin[3] = { 0, 0, 0 };
5934     size_t region[3] = { w, h, 1 };
5935     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
5936
5937     CV_OCL_CHECK(clFinish(q));
5938
5939     return;
5940 } // convertFromImage()
5941
5942
5943 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5944
5945 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5946 {
5947     cl_uint numDevices = 0;
5948     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
5949     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
5950     {
5951         CV_OCL_DBG_CHECK_RESULT(status,
5952             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
5953     }
5954
5955     if (numDevices == 0)
5956     {
5957         devices.clear();
5958         return;
5959     }
5960
5961     devices.resize((size_t)numDevices);
5962     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
5963 }
5964
5965 struct PlatformInfo::Impl
5966 {
5967     Impl(void* id)
5968     {
5969         refcount = 1;
5970         handle = *(cl_platform_id*)id;
5971         getDevices(devices, handle);
5972
5973         version_ = getStrProp(CL_PLATFORM_VERSION);
5974         parseOpenCLVersion(version_, versionMajor_, versionMinor_);
5975     }
5976
5977     String getStrProp(cl_platform_info prop) const
5978     {
5979         char buf[1024];
5980         size_t sz=0;
5981         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5982             sz < sizeof(buf) ? String(buf) : String();
5983     }
5984
5985     IMPLEMENT_REFCOUNTABLE();
5986     std::vector<cl_device_id> devices;
5987     cl_platform_id handle;
5988
5989     String version_;
5990     int versionMajor_;
5991     int versionMinor_;
5992 };
5993
5994 PlatformInfo::PlatformInfo()
5995 {
5996     p = 0;
5997 }
5998
5999 PlatformInfo::PlatformInfo(void* platform_id)
6000 {
6001     p = new Impl(platform_id);
6002 }
6003
6004 PlatformInfo::~PlatformInfo()
6005 {
6006     if(p)
6007         p->release();
6008 }
6009
6010 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6011 {
6012     if (i.p)
6013         i.p->addref();
6014     p = i.p;
6015 }
6016
6017 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6018 {
6019     if (i.p != p)
6020     {
6021         if (i.p)
6022             i.p->addref();
6023         if (p)
6024             p->release();
6025         p = i.p;
6026     }
6027     return *this;
6028 }
6029
6030 int PlatformInfo::deviceNumber() const
6031 {
6032     return p ? (int)p->devices.size() : 0;
6033 }
6034
6035 void PlatformInfo::getDevice(Device& device, int d) const
6036 {
6037     CV_Assert(p && d < (int)p->devices.size() );
6038     if(p)
6039         device.set(p->devices[d]);
6040 }
6041
6042 String PlatformInfo::name() const
6043 {
6044     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6045 }
6046
6047 String PlatformInfo::vendor() const
6048 {
6049     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6050 }
6051
6052 String PlatformInfo::version() const
6053 {
6054     return p ? p->version_ : String();
6055 }
6056
6057 int PlatformInfo::versionMajor() const
6058 {
6059     CV_Assert(p);
6060     return p->versionMajor_;
6061 }
6062
6063 int PlatformInfo::versionMinor() const
6064 {
6065     CV_Assert(p);
6066     return p->versionMinor_;
6067 }
6068
6069 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6070 {
6071     cl_uint numPlatforms = 0;
6072     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6073
6074     if (numPlatforms == 0)
6075     {
6076         platforms.clear();
6077         return;
6078     }
6079
6080     platforms.resize((size_t)numPlatforms);
6081     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6082 }
6083
6084 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6085 {
6086     std::vector<cl_platform_id> platforms;
6087     getPlatforms(platforms);
6088
6089     for (size_t i = 0; i < platforms.size(); i++)
6090         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6091 }
6092
6093 const char* typeToStr(int type)
6094 {
6095     static const char* tab[]=
6096     {
6097         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6098         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6099         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6100         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6101         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6102         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6103         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6104         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6105     };
6106     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6107     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6108 }
6109
6110 const char* memopTypeToStr(int type)
6111 {
6112     static const char* tab[] =
6113     {
6114         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6115         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6116         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6117         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6118         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6119         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6120         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6121         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6122     };
6123     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6124     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6125 }
6126
6127 const char* vecopTypeToStr(int type)
6128 {
6129     static const char* tab[] =
6130     {
6131         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6132         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6133         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6134         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6135         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6136         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6137         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6138         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6139     };
6140     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6141     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6142 }
6143
6144 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6145 {
6146     if( sdepth == ddepth )
6147         return "noconvert";
6148     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6149     if( ddepth >= CV_32F ||
6150         (ddepth == CV_32S && sdepth < CV_32S) ||
6151         (ddepth == CV_16S && sdepth <= CV_8S) ||
6152         (ddepth == CV_16U && sdepth == CV_8U))
6153     {
6154         sprintf(buf, "convert_%s", typestr);
6155     }
6156     else if( sdepth >= CV_32F )
6157         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6158     else
6159         sprintf(buf, "convert_%s_sat", typestr);
6160
6161     return buf;
6162 }
6163
6164 const char* getOpenCLErrorString(int errorCode)
6165 {
6166 #define CV_OCL_CODE(id) case id: return #id
6167 #define CV_OCL_CODE_(id, name) case id: return #name
6168     switch (errorCode)
6169     {
6170     CV_OCL_CODE(CL_SUCCESS);
6171     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6172     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6173     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6174     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6175     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6176     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6177     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6178     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6179     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6180     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6181     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6182     CV_OCL_CODE(CL_MAP_FAILURE);
6183     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6184     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6185     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6186     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6187     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6188     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6189     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6190     CV_OCL_CODE(CL_INVALID_VALUE);
6191     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6192     CV_OCL_CODE(CL_INVALID_PLATFORM);
6193     CV_OCL_CODE(CL_INVALID_DEVICE);
6194     CV_OCL_CODE(CL_INVALID_CONTEXT);
6195     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6196     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6197     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6198     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6199     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6200     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6201     CV_OCL_CODE(CL_INVALID_SAMPLER);
6202     CV_OCL_CODE(CL_INVALID_BINARY);
6203     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6204     CV_OCL_CODE(CL_INVALID_PROGRAM);
6205     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6206     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6207     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6208     CV_OCL_CODE(CL_INVALID_KERNEL);
6209     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6210     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6211     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6212     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6213     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6214     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6215     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6216     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6217     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6218     CV_OCL_CODE(CL_INVALID_EVENT);
6219     CV_OCL_CODE(CL_INVALID_OPERATION);
6220     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6221     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6222     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6223     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6224     // OpenCL 1.1
6225     CV_OCL_CODE(CL_INVALID_PROPERTY);
6226     // OpenCL 1.2
6227     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6228     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6229     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6230     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6231     // OpenCL 2.0
6232     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6233     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6234     // Extensions
6235     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6236     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6237     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6238     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6239     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6240     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6241     default: return "Unknown OpenCL error";
6242     }
6243 #undef CV_OCL_CODE
6244 #undef CV_OCL_CODE_
6245 }
6246
6247 template <typename T>
6248 static std::string kerToStr(const Mat & k)
6249 {
6250     int width = k.cols - 1, depth = k.depth();
6251     const T * const data = k.ptr<T>();
6252
6253     std::ostringstream stream;
6254     stream.precision(10);
6255
6256     if (depth <= CV_8S)
6257     {
6258         for (int i = 0; i < width; ++i)
6259             stream << "DIG(" << (int)data[i] << ")";
6260         stream << "DIG(" << (int)data[width] << ")";
6261     }
6262     else if (depth == CV_32F)
6263     {
6264         stream.setf(std::ios_base::showpoint);
6265         for (int i = 0; i < width; ++i)
6266             stream << "DIG(" << data[i] << "f)";
6267         stream << "DIG(" << data[width] << "f)";
6268     }
6269     else
6270     {
6271         for (int i = 0; i < width; ++i)
6272             stream << "DIG(" << data[i] << ")";
6273         stream << "DIG(" << data[width] << ")";
6274     }
6275
6276     return stream.str();
6277 }
6278
6279 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6280 {
6281     Mat kernel = _kernel.getMat().reshape(1, 1);
6282
6283     int depth = kernel.depth();
6284     if (ddepth < 0)
6285         ddepth = depth;
6286
6287     if (ddepth != depth)
6288         kernel.convertTo(kernel, ddepth);
6289
6290     typedef std::string (* func_t)(const Mat &);
6291     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6292                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6293     const func_t func = funcs[ddepth];
6294     CV_Assert(func != 0);
6295
6296     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6297 }
6298
6299 #define PROCESS_SRC(src) \
6300     do \
6301     { \
6302         if (!src.empty()) \
6303         { \
6304             CV_Assert(src.isMat() || src.isUMat()); \
6305             Size csize = src.size(); \
6306             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6307                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6308             if (cwidth < ckercn || ckercn <= 0) \
6309                 return 1; \
6310             cols.push_back(cwidth); \
6311             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6312                 return 1; \
6313             offsets.push_back(src.offset()); \
6314             steps.push_back(src.step()); \
6315             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6316             kercns.push_back(ckercn); \
6317         } \
6318     } \
6319     while ((void)0, 0)
6320
6321 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6322                               InputArray src4, InputArray src5, InputArray src6,
6323                               InputArray src7, InputArray src8, InputArray src9,
6324                               OclVectorStrategy strat)
6325 {
6326     const ocl::Device & d = ocl::Device::getDefault();
6327
6328     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6329         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6330         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6331         d.preferredVectorWidthDouble(), -1 };
6332
6333     // if the device says don't use vectors
6334     if (vectorWidths[0] == 1)
6335     {
6336         // it's heuristic
6337         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6338         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6339         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6340     }
6341
6342     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6343 }
6344
6345 int checkOptimalVectorWidth(const int *vectorWidths,
6346                             InputArray src1, InputArray src2, InputArray src3,
6347                             InputArray src4, InputArray src5, InputArray src6,
6348                             InputArray src7, InputArray src8, InputArray src9,
6349                             OclVectorStrategy strat)
6350 {
6351     CV_Assert(vectorWidths);
6352
6353     int ref_type = src1.type();
6354
6355     std::vector<size_t> offsets, steps, cols;
6356     std::vector<int> dividers, kercns;
6357     PROCESS_SRC(src1);
6358     PROCESS_SRC(src2);
6359     PROCESS_SRC(src3);
6360     PROCESS_SRC(src4);
6361     PROCESS_SRC(src5);
6362     PROCESS_SRC(src6);
6363     PROCESS_SRC(src7);
6364     PROCESS_SRC(src8);
6365     PROCESS_SRC(src9);
6366
6367     size_t size = offsets.size();
6368
6369     for (size_t i = 0; i < size; ++i)
6370         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6371             dividers[i] >>= 1, kercns[i] >>= 1;
6372
6373     // default strategy
6374     int kercn = *std::min_element(kercns.begin(), kercns.end());
6375
6376     return kercn;
6377 }
6378
6379 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6380                                  InputArray src4, InputArray src5, InputArray src6,
6381                                  InputArray src7, InputArray src8, InputArray src9)
6382 {
6383     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6384 }
6385
6386 #undef PROCESS_SRC
6387
6388
6389 // TODO Make this as a method of OpenCL "BuildOptions" class
6390 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6391 {
6392     if (!buildOptions.empty())
6393         buildOptions += " ";
6394     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6395     buildOptions += format(
6396             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6397             name.c_str(), ocl::typeToStr(type),
6398             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6399             name.c_str(), (int)CV_MAT_CN(type),
6400             name.c_str(), (int)CV_ELEM_SIZE(type),
6401             name.c_str(), (int)CV_ELEM_SIZE1(type),
6402             name.c_str(), (int)depth
6403             );
6404 }
6405
6406
6407 struct Image2D::Impl
6408 {
6409     Impl(const UMat &src, bool norm, bool alias)
6410     {
6411         handle = 0;
6412         refcount = 1;
6413         init(src, norm, alias);
6414     }
6415
6416     ~Impl()
6417     {
6418         if (handle)
6419             clReleaseMemObject(handle);
6420     }
6421
6422     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6423     {
6424         cl_image_format format;
6425         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6426                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6427         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6428                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6429         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6430
6431         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6432         int channelOrder = channelOrders[cn];
6433         format.image_channel_data_type = (cl_channel_type)channelType;
6434         format.image_channel_order = (cl_channel_order)channelOrder;
6435         return format;
6436     }
6437
6438     static bool isFormatSupported(cl_image_format format)
6439     {
6440         if (!haveOpenCL())
6441             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6442
6443         cl_context context = (cl_context)Context::getDefault().ptr();
6444         if (!context)
6445             return false;
6446
6447         // Figure out how many formats are supported by this context.
6448         cl_uint numFormats = 0;
6449         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6450                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6451                                                 NULL, &numFormats);
6452         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6453         if (numFormats > 0)
6454         {
6455             AutoBuffer<cl_image_format> formats(numFormats);
6456             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6457                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6458                                              formats.data(), NULL);
6459             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6460             for (cl_uint i = 0; i < numFormats; ++i)
6461             {
6462                 if (!memcmp(&formats[i], &format, sizeof(format)))
6463                 {
6464                     return true;
6465                 }
6466             }
6467         }
6468         return false;
6469     }
6470
6471     void init(const UMat &src, bool norm, bool alias)
6472     {
6473         if (!haveOpenCL())
6474             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6475
6476         CV_Assert(!src.empty());
6477         CV_Assert(ocl::Device::getDefault().imageSupport());
6478
6479         int err, depth = src.depth(), cn = src.channels();
6480         CV_Assert(cn <= 4);
6481         cl_image_format format = getImageFormat(depth, cn, norm);
6482
6483         if (!isFormatSupported(format))
6484             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
6485
6486         if (alias && !src.handle(ACCESS_RW))
6487             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
6488
6489         cl_context context = (cl_context)Context::getDefault().ptr();
6490         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
6491
6492 #ifdef CL_VERSION_1_2
6493         // this enables backwards portability to
6494         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
6495         const Device & d = ocl::Device::getDefault();
6496         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
6497         CV_Assert(!alias || canCreateAlias(src));
6498         if (1 < major || (1 == major && 2 <= minor))
6499         {
6500             cl_image_desc desc;
6501             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
6502             desc.image_width      = src.cols;
6503             desc.image_height     = src.rows;
6504             desc.image_depth      = 0;
6505             desc.image_array_size = 1;
6506             desc.image_row_pitch  = alias ? src.step[0] : 0;
6507             desc.image_slice_pitch = 0;
6508             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
6509             desc.num_mip_levels   = 0;
6510             desc.num_samples      = 0;
6511             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
6512         }
6513         else
6514 #endif
6515         {
6516             CV_SUPPRESS_DEPRECATED_START
6517             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
6518             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
6519             CV_SUPPRESS_DEPRECATED_END
6520         }
6521         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
6522
6523         size_t origin[] = { 0, 0, 0 };
6524         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
6525
6526         cl_mem devData;
6527         if (!alias && !src.isContinuous())
6528         {
6529             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
6530             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
6531                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
6532                 ).c_str());
6533
6534             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
6535             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
6536                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
6537             CV_OCL_DBG_CHECK(clFlush(queue));
6538         }
6539         else
6540         {
6541             devData = (cl_mem)src.handle(ACCESS_READ);
6542         }
6543         CV_Assert(devData != NULL);
6544
6545         if (!alias)
6546         {
6547             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
6548             if (!src.isContinuous())
6549             {
6550                 CV_OCL_DBG_CHECK(clFlush(queue));
6551                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
6552             }
6553         }
6554     }
6555
6556     IMPLEMENT_REFCOUNTABLE();
6557
6558     cl_mem handle;
6559 };
6560
6561 Image2D::Image2D()
6562 {
6563     p = NULL;
6564 }
6565
6566 Image2D::Image2D(const UMat &src, bool norm, bool alias)
6567 {
6568     p = new Impl(src, norm, alias);
6569 }
6570
6571 bool Image2D::canCreateAlias(const UMat &m)
6572 {
6573     bool ret = false;
6574     const Device & d = ocl::Device::getDefault();
6575     if (d.imageFromBufferSupport() && !m.empty())
6576     {
6577         // This is the required pitch alignment in pixels
6578         uint pitchAlign = d.imagePitchAlignment();
6579         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
6580         {
6581             // We don't currently handle the case where the buffer was created
6582             // with CL_MEM_USE_HOST_PTR
6583             if (!m.u->tempUMat())
6584             {
6585                 ret = true;
6586             }
6587         }
6588     }
6589     return ret;
6590 }
6591
6592 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
6593 {
6594     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
6595
6596     return Impl::isFormatSupported(format);
6597 }
6598
6599 Image2D::Image2D(const Image2D & i)
6600 {
6601     p = i.p;
6602     if (p)
6603         p->addref();
6604 }
6605
6606 Image2D & Image2D::operator = (const Image2D & i)
6607 {
6608     if (i.p != p)
6609     {
6610         if (i.p)
6611             i.p->addref();
6612         if (p)
6613             p->release();
6614         p = i.p;
6615     }
6616     return *this;
6617 }
6618
6619 Image2D::~Image2D()
6620 {
6621     if (p)
6622         p->release();
6623 }
6624
6625 void* Image2D::ptr() const
6626 {
6627     return p ? p->handle : 0;
6628 }
6629
6630 bool internal::isOpenCLForced()
6631 {
6632     static bool initialized = false;
6633     static bool value = false;
6634     if (!initialized)
6635     {
6636         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
6637         initialized = true;
6638     }
6639     return value;
6640 }
6641
6642 bool internal::isPerformanceCheckBypassed()
6643 {
6644     static bool initialized = false;
6645     static bool value = false;
6646     if (!initialized)
6647     {
6648         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
6649         initialized = true;
6650     }
6651     return value;
6652 }
6653
6654 bool internal::isCLBuffer(UMat& u)
6655 {
6656     void* h = u.handle(ACCESS_RW);
6657     if (!h)
6658         return true;
6659     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
6660 #if 1
6661     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
6662         return false;
6663 #else
6664     cl_mem_object_type type = 0;
6665     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
6666     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
6667         return false;
6668 #endif
6669     return true;
6670 }
6671
6672 struct Timer::Impl
6673 {
6674     const Queue queue;
6675
6676     Impl(const Queue& q)
6677         : queue(q)
6678     {
6679     }
6680
6681     ~Impl(){}
6682
6683     void start()
6684     {
6685         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6686         timer.start();
6687     }
6688
6689     void stop()
6690     {
6691         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6692         timer.stop();
6693     }
6694
6695     uint64 durationNS() const
6696     {
6697         return (uint64)(timer.getTimeSec() * 1e9);
6698     }
6699
6700     TickMeter timer;
6701 };
6702
6703 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
6704 Timer::~Timer() { delete p; }
6705
6706 void Timer::start()
6707 {
6708     CV_Assert(p);
6709     p->start();
6710 }
6711
6712 void Timer::stop()
6713 {
6714     CV_Assert(p);
6715     p->stop();
6716 }
6717
6718 uint64 Timer::durationNS() const
6719 {
6720     CV_Assert(p);
6721     return p->durationNS();
6722 }
6723
6724 }} // namespace
6725
6726 #endif // HAVE_OPENCL