Merge pull request #18270 from komakai:swift-inout-arrays
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832
833 struct OpenCLExecutionContext::Impl
834 {
835     ocl::Context context_;
836     int device_;  // device index in context
837     ocl::Queue queue_;
838     int useOpenCL_;
839
840 protected:
841     Impl() = delete;
842
843     void _init_device(cl_device_id deviceID)
844     {
845         CV_Assert(deviceID);
846         int ndevices = (int)context_.ndevices();
847         CV_Assert(ndevices > 0);
848         bool found = false;
849         for (int i = 0; i < ndevices; i++)
850         {
851             ocl::Device d = context_.device(i);
852             cl_device_id dhandle = (cl_device_id)d.ptr();
853             if (dhandle == deviceID)
854             {
855                 device_ = i;
856                 found = true;
857                 break;
858             }
859         }
860         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
861     }
862
863     void _init_device(const ocl::Device& device)
864     {
865         CV_Assert(device.ptr());
866         int ndevices = (int)context_.ndevices();
867         CV_Assert(ndevices > 0);
868         bool found = false;
869         for (int i = 0; i < ndevices; i++)
870         {
871             ocl::Device d = context_.device(i);
872             if (d.getImpl() == device.getImpl())
873             {
874                 device_ = i;
875                 found = true;
876                 break;
877             }
878         }
879         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
880     }
881
882 public:
883     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
884         : device_(0), useOpenCL_(-1)
885     {
886         CV_UNUSED(platformID);
887         CV_Assert(context);
888         CV_Assert(deviceID);
889
890         context_ = Context::fromHandle(context);
891         _init_device(deviceID);
892         queue_ = Queue(context_, context_.device(device_));
893     }
894
895     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
896         : device_(0), useOpenCL_(-1)
897     {
898         CV_Assert(context.ptr());
899         CV_Assert(device.ptr());
900
901         context_ = context;
902         _init_device(device);
903         queue_ = queue;
904     }
905
906     Impl(const ocl::Context& context, const ocl::Device& device)
907         : device_(0), useOpenCL_(-1)
908     {
909         CV_Assert(context.ptr());
910         CV_Assert(device.ptr());
911
912         context_ = context;
913         _init_device(device);
914         queue_ = Queue(context_, context_.device(device_));
915     }
916
917     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
918         : context_(context)
919         , device_(device)
920         , queue_(queue)
921         , useOpenCL_(-1)
922     {
923         // nothing
924     }
925     Impl(const Impl& other)
926         : context_(other.context_)
927         , device_(other.device_)
928         , queue_(other.queue_)
929         , useOpenCL_(-1)
930     {
931         // nothing
932     }
933
934     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
935     bool useOpenCL()
936     {
937         if (useOpenCL_ < 0)
938         {
939             try
940             {
941                 useOpenCL_ = 0;
942                 if (!context_.empty() && context_.ndevices() > 0)
943                 {
944                     const Device& d = context_.device(device_);
945                     useOpenCL_ = d.available();
946                 }
947             }
948             catch (const cv::Exception&)
949             {
950                 // nothing
951             }
952             if (!useOpenCL_)
953                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
954         }
955         return useOpenCL_ > 0;
956     }
957
958     void setUseOpenCL(bool flag)
959     {
960         if (!flag)
961             useOpenCL_ = 0;
962         else
963             useOpenCL_ = -1;
964     }
965
966     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
967     {
968         CV_TRACE_FUNCTION();
969
970         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
971
972         static bool initialized = false;
973         static std::shared_ptr<Impl> g_primaryExecutionContext;
974
975         if (!initialized)
976         {
977             cv::AutoLock lock(getInitializationMutex());
978             if (!initialized)
979             {
980                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
981                 try
982                 {
983                     Context c = ocl::Context::create(std::string());
984                     if (c.ndevices())
985                     {
986                         int deviceId = 0;
987                         auto& d = c.device(deviceId);
988                         if (d.available())
989                         {
990                             auto q = ocl::Queue(c, d);
991                             if (!q.ptr())
992                             {
993                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
994                             }
995                             else
996                             {
997                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
998                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
999                             }
1000                         }
1001                         else
1002                         {
1003                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1004                         }
1005                     }
1006                     else
1007                     {
1008                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1009                     }
1010                 }
1011                 catch (const std::exception& e)
1012                 {
1013                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1014                 }
1015                 catch (...)
1016                 {
1017                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1018                 }
1019                 initialized = true;
1020             }
1021         }
1022         return g_primaryExecutionContext;
1023     }
1024 };
1025
1026 Context& OpenCLExecutionContext::getContext() const
1027 {
1028     CV_Assert(p);
1029     return p->context_;
1030 }
1031 Device& OpenCLExecutionContext::getDevice() const
1032 {
1033     CV_Assert(p);
1034     return p->context_.device(p->device_);
1035 }
1036 Queue& OpenCLExecutionContext::getQueue() const
1037 {
1038     CV_Assert(p);
1039     return p->queue_;
1040 }
1041
1042 bool OpenCLExecutionContext::useOpenCL() const
1043 {
1044     if (p)
1045         return p->useOpenCL();
1046     return false;
1047 }
1048 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1049 {
1050     CV_Assert(p);
1051     p->setUseOpenCL(flag);
1052 }
1053
1054 /* static */
1055 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1056 {
1057     CV_TRACE_FUNCTION();
1058     CoreTLSData& data = getCoreTlsData();
1059     OpenCLExecutionContext& c = data.oclExecutionContext;
1060     if (!data.oclExecutionContextInitialized)
1061     {
1062         data.oclExecutionContextInitialized = true;
1063         if (c.empty() && haveOpenCL())
1064             c.p = Impl::getInitializedExecutionContext();
1065     }
1066     return c;
1067 }
1068
1069 /* static */
1070 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1071 {
1072     CV_TRACE_FUNCTION();
1073     CoreTLSData& data = getCoreTlsData();
1074     OpenCLExecutionContext& c = data.oclExecutionContext;
1075     return c;
1076 }
1077
1078 void OpenCLExecutionContext::bind() const
1079 {
1080     CV_TRACE_FUNCTION();
1081     CV_Assert(p);
1082     CoreTLSData& data = getCoreTlsData();
1083     data.oclExecutionContext = *this;
1084     data.oclExecutionContextInitialized = true;
1085     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1086 }
1087
1088
1089 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1090 {
1091     CV_TRACE_FUNCTION();
1092     CV_Assert(p);
1093     const Queue q(getContext(), getDevice());
1094     return cloneWithNewQueue(q);
1095 }
1096
1097 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1098 {
1099     CV_TRACE_FUNCTION();
1100     CV_Assert(p);
1101     CV_Assert(q.ptr() != NULL);
1102     OpenCLExecutionContext c;
1103     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1104     return c;
1105 }
1106
1107 /* static */
1108 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1109 {
1110     CV_TRACE_FUNCTION();
1111     if (!haveOpenCL())
1112         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1113
1114     CV_Assert(!context.empty());
1115     CV_Assert(context.ptr());
1116     CV_Assert(!device.empty());
1117     CV_Assert(device.ptr());
1118     OpenCLExecutionContext ctx;
1119     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1120     return ctx;
1121
1122 }
1123
1124 /* static */
1125 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1126 {
1127     CV_TRACE_FUNCTION();
1128     if (!haveOpenCL())
1129         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1130
1131     CV_Assert(!context.empty());
1132     CV_Assert(context.ptr());
1133     CV_Assert(!device.empty());
1134     CV_Assert(device.ptr());
1135     OpenCLExecutionContext ctx;
1136     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1137     return ctx;
1138
1139 }
1140
1141 void OpenCLExecutionContext::release()
1142 {
1143     CV_TRACE_FUNCTION();
1144     p.reset();
1145 }
1146
1147
1148 // true if we have initialized OpenCL subsystem with available platforms
1149 static bool g_isOpenCVActivated = false;
1150
1151 bool haveOpenCL()
1152 {
1153     CV_TRACE_FUNCTION();
1154     static bool g_isOpenCLInitialized = false;
1155     static bool g_isOpenCLAvailable = false;
1156
1157     if (!g_isOpenCLInitialized)
1158     {
1159         CV_TRACE_REGION("Init_OpenCL_Runtime");
1160         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1161         if (envPath)
1162         {
1163             if (cv::String(envPath) == "disabled")
1164             {
1165                 g_isOpenCLAvailable = false;
1166                 g_isOpenCLInitialized = true;
1167                 return false;
1168             }
1169         }
1170
1171         cv::AutoLock lock(getInitializationMutex());
1172         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1173         try
1174         {
1175             cl_uint n = 0;
1176             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1177             g_isOpenCVActivated = n > 0;
1178             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1179         }
1180         catch (...)
1181         {
1182             g_isOpenCLAvailable = false;
1183         }
1184         g_isOpenCLInitialized = true;
1185     }
1186     return g_isOpenCLAvailable;
1187 }
1188
1189 bool useOpenCL()
1190 {
1191     CoreTLSData& data = getCoreTlsData();
1192     if (data.useOpenCL < 0)
1193     {
1194         try
1195         {
1196             data.useOpenCL = 0;
1197             if (haveOpenCL())
1198             {
1199                 auto c = OpenCLExecutionContext::getCurrent();
1200                 data.useOpenCL = c.useOpenCL();
1201             }
1202         }
1203         catch (...)
1204         {
1205             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1206         }
1207     }
1208     return data.useOpenCL > 0;
1209 }
1210
1211 bool isOpenCLActivated()
1212 {
1213     if (!g_isOpenCVActivated)
1214         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1215     return useOpenCL();
1216 }
1217
1218 void setUseOpenCL(bool flag)
1219 {
1220     CV_TRACE_FUNCTION();
1221
1222     CoreTLSData& data = getCoreTlsData();
1223     auto& c = OpenCLExecutionContext::getCurrentRef();
1224     if (!c.empty())
1225     {
1226         c.setUseOpenCL(flag);
1227         data.useOpenCL = c.useOpenCL();
1228     }
1229     else
1230     {
1231         if (!flag)
1232             data.useOpenCL = 0;
1233         else
1234             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1235     }
1236 }
1237
1238
1239
1240 #ifdef HAVE_CLAMDBLAS
1241
1242 class AmdBlasHelper
1243 {
1244 public:
1245     static AmdBlasHelper & getInstance()
1246     {
1247         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1248     }
1249
1250     bool isAvailable() const
1251     {
1252         return g_isAmdBlasAvailable;
1253     }
1254
1255     ~AmdBlasHelper()
1256     {
1257         try
1258         {
1259             clAmdBlasTeardown();
1260         }
1261         catch (...) { }
1262     }
1263
1264 protected:
1265     AmdBlasHelper()
1266     {
1267         if (!g_isAmdBlasInitialized)
1268         {
1269             AutoLock lock(getInitializationMutex());
1270
1271             if (!g_isAmdBlasInitialized)
1272             {
1273                 if (haveOpenCL())
1274                 {
1275                     try
1276                     {
1277                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1278                     }
1279                     catch (...)
1280                     {
1281                         g_isAmdBlasAvailable = false;
1282                     }
1283                 }
1284                 else
1285                     g_isAmdBlasAvailable = false;
1286
1287                 g_isAmdBlasInitialized = true;
1288             }
1289         }
1290     }
1291
1292 private:
1293     static bool g_isAmdBlasInitialized;
1294     static bool g_isAmdBlasAvailable;
1295 };
1296
1297 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1298 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1299
1300 bool haveAmdBlas()
1301 {
1302     return AmdBlasHelper::getInstance().isAvailable();
1303 }
1304
1305 #else
1306
1307 bool haveAmdBlas()
1308 {
1309     return false;
1310 }
1311
1312 #endif
1313
1314 #ifdef HAVE_CLAMDFFT
1315
1316 class AmdFftHelper
1317 {
1318 public:
1319     static AmdFftHelper & getInstance()
1320     {
1321         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1322     }
1323
1324     bool isAvailable() const
1325     {
1326         return g_isAmdFftAvailable;
1327     }
1328
1329     ~AmdFftHelper()
1330     {
1331         try
1332         {
1333 //            clAmdFftTeardown();
1334         }
1335         catch (...) { }
1336     }
1337
1338 protected:
1339     AmdFftHelper()
1340     {
1341         if (!g_isAmdFftInitialized)
1342         {
1343             AutoLock lock(getInitializationMutex());
1344
1345             if (!g_isAmdFftInitialized)
1346             {
1347                 if (haveOpenCL())
1348                 {
1349                     try
1350                     {
1351                         cl_uint major, minor, patch;
1352                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1353
1354                         // it throws exception in case AmdFft binaries are not found
1355                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1356                         g_isAmdFftAvailable = true;
1357                     }
1358                     catch (const Exception &)
1359                     {
1360                         g_isAmdFftAvailable = false;
1361                     }
1362                 }
1363                 else
1364                     g_isAmdFftAvailable = false;
1365
1366                 g_isAmdFftInitialized = true;
1367             }
1368         }
1369     }
1370
1371 private:
1372     static clAmdFftSetupData setupData;
1373     static bool g_isAmdFftInitialized;
1374     static bool g_isAmdFftAvailable;
1375 };
1376
1377 clAmdFftSetupData AmdFftHelper::setupData;
1378 bool AmdFftHelper::g_isAmdFftAvailable = false;
1379 bool AmdFftHelper::g_isAmdFftInitialized = false;
1380
1381 bool haveAmdFft()
1382 {
1383     return AmdFftHelper::getInstance().isAvailable();
1384 }
1385
1386 #else
1387
1388 bool haveAmdFft()
1389 {
1390     return false;
1391 }
1392
1393 #endif
1394
1395 bool haveSVM()
1396 {
1397 #ifdef HAVE_OPENCL_SVM
1398     return true;
1399 #else
1400     return false;
1401 #endif
1402 }
1403
1404 void finish()
1405 {
1406     Queue::getDefault().finish();
1407 }
1408
1409 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1410
1411 struct Platform::Impl
1412 {
1413     Impl()
1414     {
1415         refcount = 1;
1416         handle = 0;
1417         initialized = false;
1418     }
1419
1420     ~Impl() {}
1421
1422     void init()
1423     {
1424         if( !initialized )
1425         {
1426             //cl_uint num_entries
1427             cl_uint n = 0;
1428             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1429                 handle = 0;
1430             if( handle != 0 )
1431             {
1432                 char buf[1000];
1433                 size_t len = 0;
1434                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1435                 buf[len] = '\0';
1436                 vendor = String(buf);
1437             }
1438
1439             initialized = true;
1440         }
1441     }
1442
1443     IMPLEMENT_REFCOUNTABLE();
1444
1445     cl_platform_id handle;
1446     String vendor;
1447     bool initialized;
1448 };
1449
1450 Platform::Platform()
1451 {
1452     p = 0;
1453 }
1454
1455 Platform::~Platform()
1456 {
1457     if(p)
1458         p->release();
1459 }
1460
1461 Platform::Platform(const Platform& pl)
1462 {
1463     p = (Impl*)pl.p;
1464     if(p)
1465         p->addref();
1466 }
1467
1468 Platform& Platform::operator = (const Platform& pl)
1469 {
1470     Impl* newp = (Impl*)pl.p;
1471     if(newp)
1472         newp->addref();
1473     if(p)
1474         p->release();
1475     p = newp;
1476     return *this;
1477 }
1478
1479 void* Platform::ptr() const
1480 {
1481     return p ? p->handle : 0;
1482 }
1483
1484 Platform& Platform::getDefault()
1485 {
1486     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1487     static Platform p;
1488     if( !p.p )
1489     {
1490         p.p = new Impl;
1491         p.p->init();
1492     }
1493     return p;
1494 }
1495
1496 /////////////////////////////////////// Device ////////////////////////////////////////////
1497
1498 // deviceVersion has format
1499 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1500 // by specification
1501 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1502 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1503 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1504 {
1505     major = minor = 0;
1506     if (10 >= deviceVersion.length())
1507         return;
1508     const char *pstr = deviceVersion.c_str();
1509     if (0 != strncmp(pstr, "OpenCL ", 7))
1510         return;
1511     size_t ppos = deviceVersion.find('.', 7);
1512     if (String::npos == ppos)
1513         return;
1514     String temp = deviceVersion.substr(7, ppos - 7);
1515     major = atoi(temp.c_str());
1516     temp = deviceVersion.substr(ppos + 1);
1517     minor = atoi(temp.c_str());
1518 }
1519
1520 struct Device::Impl
1521 {
1522     Impl(void* d)
1523         : refcount(1)
1524         , handle(0)
1525     {
1526         try
1527         {
1528             cl_device_id device = (cl_device_id)d;
1529             _init(device);
1530             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1531         }
1532         catch (...)
1533         {
1534             throw;
1535         }
1536     }
1537
1538     void _init(cl_device_id d)
1539     {
1540         handle = (cl_device_id)d;
1541
1542         name_ = getStrProp(CL_DEVICE_NAME);
1543         version_ = getStrProp(CL_DEVICE_VERSION);
1544         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1545         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1546         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1547         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1548         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1549         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1550         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1551         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1552
1553         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1554         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1555
1556         size_t pos = 0;
1557         while (pos < extensions_.size())
1558         {
1559             size_t pos2 = extensions_.find(' ', pos);
1560             if (pos2 == String::npos)
1561                 pos2 = extensions_.size();
1562             if (pos2 > pos)
1563             {
1564                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1565                 extensions_set_.insert(extensionName);
1566             }
1567             pos = pos2 + 1;
1568         }
1569
1570         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1571
1572         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1573         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1574             vendorName_ == "AMD")
1575             vendorID_ = VENDOR_AMD;
1576         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1577             vendorID_ = VENDOR_INTEL;
1578         else if (vendorName_ == "NVIDIA Corporation")
1579             vendorID_ = VENDOR_NVIDIA;
1580         else
1581             vendorID_ = UNKNOWN_VENDOR;
1582
1583         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1584         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1585         {
1586             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1587             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1588                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1589             maxWorkGroupSize_ = new_maxWorkGroupSize;
1590         }
1591 #if 0
1592         if (isExtensionSupported("cl_khr_spir"))
1593         {
1594 #ifndef CL_DEVICE_SPIR_VERSIONS
1595 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1596 #endif
1597             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1598             std::cout << spir_versions << std::endl;
1599         }
1600 #endif
1601     }
1602
1603     ~Impl()
1604     {
1605 #ifdef _WIN32
1606         if (!cv::__termination)
1607 #endif
1608         {
1609             if (handle)
1610             {
1611                 CV_OCL_CHECK(clReleaseDevice(handle));
1612                 handle = 0;
1613             }
1614         }
1615     }
1616
1617     template<typename _TpCL, typename _TpOut>
1618     _TpOut getProp(cl_device_info prop) const
1619     {
1620         _TpCL temp=_TpCL();
1621         size_t sz = 0;
1622
1623         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1624             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1625     }
1626
1627     bool getBoolProp(cl_device_info prop) const
1628     {
1629         cl_bool temp = CL_FALSE;
1630         size_t sz = 0;
1631
1632         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1633             sz == sizeof(temp) ? temp != 0 : false;
1634     }
1635
1636     String getStrProp(cl_device_info prop) const
1637     {
1638         char buf[4096];
1639         size_t sz=0;
1640         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1641             sz < sizeof(buf) ? String(buf) : String();
1642     }
1643
1644     bool isExtensionSupported(const std::string& extensionName) const
1645     {
1646         return extensions_set_.count(extensionName) > 0;
1647     }
1648
1649
1650     IMPLEMENT_REFCOUNTABLE();
1651
1652     cl_device_id handle;
1653
1654     String name_;
1655     String version_;
1656     std::string extensions_;
1657     int doubleFPConfig_;
1658     bool hostUnifiedMemory_;
1659     int maxComputeUnits_;
1660     size_t maxWorkGroupSize_;
1661     int type_;
1662     int addressBits_;
1663     int deviceVersionMajor_;
1664     int deviceVersionMinor_;
1665     String driverVersion_;
1666     String vendorName_;
1667     int vendorID_;
1668     bool intelSubgroupsSupport_;
1669
1670     std::set<std::string> extensions_set_;
1671 };
1672
1673
1674 Device::Device()
1675 {
1676     p = 0;
1677 }
1678
1679 Device::Device(void* d)
1680 {
1681     p = 0;
1682     set(d);
1683 }
1684
1685 Device::Device(const Device& d)
1686 {
1687     p = d.p;
1688     if(p)
1689         p->addref();
1690 }
1691
1692 Device& Device::operator = (const Device& d)
1693 {
1694     Impl* newp = (Impl*)d.p;
1695     if(newp)
1696         newp->addref();
1697     if(p)
1698         p->release();
1699     p = newp;
1700     return *this;
1701 }
1702
1703 Device::~Device()
1704 {
1705     if(p)
1706         p->release();
1707 }
1708
1709 void Device::set(void* d)
1710 {
1711     if(p)
1712         p->release();
1713     p = new Impl(d);
1714     if (p->handle)
1715     {
1716         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1717     }
1718 }
1719
1720 Device Device::fromHandle(void* d)
1721 {
1722     Device device(d);
1723     return device;
1724 }
1725
1726 void* Device::ptr() const
1727 {
1728     return p ? p->handle : 0;
1729 }
1730
1731 String Device::name() const
1732 { return p ? p->name_ : String(); }
1733
1734 String Device::extensions() const
1735 { return p ? String(p->extensions_) : String(); }
1736
1737 bool Device::isExtensionSupported(const String& extensionName) const
1738 { return p ? p->isExtensionSupported(extensionName) : false; }
1739
1740 String Device::version() const
1741 { return p ? p->version_ : String(); }
1742
1743 String Device::vendorName() const
1744 { return p ? p->vendorName_ : String(); }
1745
1746 int Device::vendorID() const
1747 { return p ? p->vendorID_ : 0; }
1748
1749 String Device::OpenCL_C_Version() const
1750 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1751
1752 String Device::OpenCLVersion() const
1753 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1754
1755 int Device::deviceVersionMajor() const
1756 { return p ? p->deviceVersionMajor_ : 0; }
1757
1758 int Device::deviceVersionMinor() const
1759 { return p ? p->deviceVersionMinor_ : 0; }
1760
1761 String Device::driverVersion() const
1762 { return p ? p->driverVersion_ : String(); }
1763
1764 int Device::type() const
1765 { return p ? p->type_ : 0; }
1766
1767 int Device::addressBits() const
1768 { return p ? p->addressBits_ : 0; }
1769
1770 bool Device::available() const
1771 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1772
1773 bool Device::compilerAvailable() const
1774 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1775
1776 bool Device::linkerAvailable() const
1777 #ifdef CL_VERSION_1_2
1778 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1779 #else
1780 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1781 #endif
1782
1783 int Device::doubleFPConfig() const
1784 { return p ? p->doubleFPConfig_ : 0; }
1785
1786 int Device::singleFPConfig() const
1787 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1788
1789 int Device::halfFPConfig() const
1790 #ifdef CL_VERSION_1_2
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1792 #else
1793 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1794 #endif
1795
1796 bool Device::endianLittle() const
1797 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1798
1799 bool Device::errorCorrectionSupport() const
1800 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1801
1802 int Device::executionCapabilities() const
1803 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1804
1805 size_t Device::globalMemCacheSize() const
1806 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1807
1808 int Device::globalMemCacheType() const
1809 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1810
1811 int Device::globalMemCacheLineSize() const
1812 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1813
1814 size_t Device::globalMemSize() const
1815 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1816
1817 size_t Device::localMemSize() const
1818 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1819
1820 int Device::localMemType() const
1821 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1822
1823 bool Device::hostUnifiedMemory() const
1824 { return p ? p->hostUnifiedMemory_ : false; }
1825
1826 bool Device::imageSupport() const
1827 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1828
1829 bool Device::imageFromBufferSupport() const
1830 {
1831     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1832 }
1833
1834 uint Device::imagePitchAlignment() const
1835 {
1836 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1837     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1838 #else
1839     return 0;
1840 #endif
1841 }
1842
1843 uint Device::imageBaseAddressAlignment() const
1844 {
1845 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1846     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1847 #else
1848     return 0;
1849 #endif
1850 }
1851
1852 size_t Device::image2DMaxWidth() const
1853 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1854
1855 size_t Device::image2DMaxHeight() const
1856 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1857
1858 size_t Device::image3DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1860
1861 size_t Device::image3DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1863
1864 size_t Device::image3DMaxDepth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1866
1867 size_t Device::imageMaxBufferSize() const
1868 #ifdef CL_VERSION_1_2
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1870 #else
1871 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1872 #endif
1873
1874 size_t Device::imageMaxArraySize() const
1875 #ifdef CL_VERSION_1_2
1876 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1877 #else
1878 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1879 #endif
1880
1881 bool Device::intelSubgroupsSupport() const
1882 { return p ? p->intelSubgroupsSupport_ : false; }
1883
1884 int Device::maxClockFrequency() const
1885 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1886
1887 int Device::maxComputeUnits() const
1888 { return p ? p->maxComputeUnits_ : 0; }
1889
1890 int Device::maxConstantArgs() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1892
1893 size_t Device::maxConstantBufferSize() const
1894 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1895
1896 size_t Device::maxMemAllocSize() const
1897 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1898
1899 size_t Device::maxParameterSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1901
1902 int Device::maxReadImageArgs() const
1903 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1904
1905 int Device::maxWriteImageArgs() const
1906 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1907
1908 int Device::maxSamplers() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1910
1911 size_t Device::maxWorkGroupSize() const
1912 { return p ? p->maxWorkGroupSize_ : 0; }
1913
1914 int Device::maxWorkItemDims() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1916
1917 void Device::maxWorkItemSizes(size_t* sizes) const
1918 {
1919     if(p)
1920     {
1921         const int MAX_DIMS = 32;
1922         size_t retsz = 0;
1923         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1924                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1925     }
1926 }
1927
1928 int Device::memBaseAddrAlign() const
1929 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1930
1931 int Device::nativeVectorWidthChar() const
1932 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1933
1934 int Device::nativeVectorWidthShort() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1936
1937 int Device::nativeVectorWidthInt() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1939
1940 int Device::nativeVectorWidthLong() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1942
1943 int Device::nativeVectorWidthFloat() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1945
1946 int Device::nativeVectorWidthDouble() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1948
1949 int Device::nativeVectorWidthHalf() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1951
1952 int Device::preferredVectorWidthChar() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1954
1955 int Device::preferredVectorWidthShort() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1957
1958 int Device::preferredVectorWidthInt() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1960
1961 int Device::preferredVectorWidthLong() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1963
1964 int Device::preferredVectorWidthFloat() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1966
1967 int Device::preferredVectorWidthDouble() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1969
1970 int Device::preferredVectorWidthHalf() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1972
1973 size_t Device::printfBufferSize() const
1974 #ifdef CL_VERSION_1_2
1975 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1976 #else
1977 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1978 #endif
1979
1980
1981 size_t Device::profilingTimerResolution() const
1982 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1983
1984 const Device& Device::getDefault()
1985 {
1986     auto& c = OpenCLExecutionContext::getCurrent();
1987     if (!c.empty())
1988     {
1989         return c.getDevice();
1990     }
1991
1992     static Device dummy;
1993     return dummy;
1994 }
1995
1996 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1997
1998 template <typename Functor, typename ObjectType>
1999 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2000 {
2001     ::size_t required;
2002     cl_int err = f(obj, name, 0, NULL, &required);
2003     if (err != CL_SUCCESS)
2004         return err;
2005
2006     param.clear();
2007     if (required > 0)
2008     {
2009         AutoBuffer<char> buf(required + 1);
2010         char* ptr = buf.data(); // cleanup is not needed
2011         err = f(obj, name, required, ptr, NULL);
2012         if (err != CL_SUCCESS)
2013             return err;
2014         param = ptr;
2015     }
2016
2017     return CL_SUCCESS;
2018 }
2019
2020 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2021 {
2022     elems.clear();
2023     if (s.size() == 0)
2024         return;
2025     std::istringstream ss(s);
2026     std::string item;
2027     while (!ss.eof())
2028     {
2029         std::getline(ss, item, delim);
2030         elems.push_back(item);
2031     }
2032 }
2033
2034 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2035 // Sample: AMD:GPU:
2036 // Sample: AMD:GPU:Tahiti
2037 // Sample: :GPU|CPU: = '' = ':' = '::'
2038 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2039         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2040 {
2041     std::vector<std::string> parts;
2042     split(configurationStr, ':', parts);
2043     if (parts.size() > 3)
2044     {
2045         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2046         return false;
2047     }
2048     if (parts.size() > 2)
2049         deviceNameOrID = parts[2];
2050     if (parts.size() > 1)
2051     {
2052         split(parts[1], '|', deviceTypes);
2053     }
2054     if (parts.size() > 0)
2055     {
2056         platform = parts[0];
2057     }
2058     return true;
2059 }
2060
2061 #if defined WINRT || defined _WIN32_WCE
2062 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2063 {
2064     CV_UNUSED(configuration)
2065     return NULL;
2066 }
2067 #else
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2069 {
2070     std::string platform, deviceName;
2071     std::vector<std::string> deviceTypes;
2072
2073     if (!configuration)
2074         configuration = getenv("OPENCV_OPENCL_DEVICE");
2075
2076     if (configuration &&
2077             (strcmp(configuration, "disabled") == 0 ||
2078              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2079             ))
2080         return NULL;
2081
2082     bool isID = false;
2083     int deviceID = -1;
2084     if (deviceName.length() == 1)
2085     // We limit ID range to 0..9, because we want to write:
2086     // - '2500' to mean i5-2500
2087     // - '8350' to mean AMD FX-8350
2088     // - '650' to mean GeForce 650
2089     // To extend ID range change condition to '> 0'
2090     {
2091         isID = true;
2092         for (size_t i = 0; i < deviceName.length(); i++)
2093         {
2094             if (!isdigit(deviceName[i]))
2095             {
2096                 isID = false;
2097                 break;
2098             }
2099         }
2100         if (isID)
2101         {
2102             deviceID = atoi(deviceName.c_str());
2103             if (deviceID < 0)
2104                 return NULL;
2105         }
2106     }
2107
2108     std::vector<cl_platform_id> platforms;
2109     {
2110         cl_uint numPlatforms = 0;
2111         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2112
2113         if (numPlatforms == 0)
2114             return NULL;
2115         platforms.resize((size_t)numPlatforms);
2116         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2117         platforms.resize(numPlatforms);
2118     }
2119
2120     int selectedPlatform = -1;
2121     if (platform.length() > 0)
2122     {
2123         for (size_t i = 0; i < platforms.size(); i++)
2124         {
2125             std::string name;
2126             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2127             if (name.find(platform) != std::string::npos)
2128             {
2129                 selectedPlatform = (int)i;
2130                 break;
2131             }
2132         }
2133         if (selectedPlatform == -1)
2134         {
2135             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2136             goto not_found;
2137         }
2138     }
2139     if (deviceTypes.size() == 0)
2140     {
2141         if (!isID)
2142         {
2143             deviceTypes.push_back("GPU");
2144             if (configuration)
2145                 deviceTypes.push_back("CPU");
2146         }
2147         else
2148             deviceTypes.push_back("ALL");
2149     }
2150     for (size_t t = 0; t < deviceTypes.size(); t++)
2151     {
2152         int deviceType = 0;
2153         std::string tempStrDeviceType = deviceTypes[t];
2154         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2155
2156         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2157             deviceType = Device::TYPE_GPU;
2158         else if (tempStrDeviceType == "cpu")
2159             deviceType = Device::TYPE_CPU;
2160         else if (tempStrDeviceType == "accelerator")
2161             deviceType = Device::TYPE_ACCELERATOR;
2162         else if (tempStrDeviceType == "all")
2163             deviceType = Device::TYPE_ALL;
2164         else
2165         {
2166             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2167             goto not_found;
2168         }
2169
2170         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2171         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2172                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2173                 i++)
2174         {
2175             cl_uint count = 0;
2176             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2177             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2178             {
2179                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2180             }
2181             if (count == 0)
2182                 continue;
2183             size_t base = devices.size();
2184             devices.resize(base + count);
2185             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2186             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2187             {
2188                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2189             }
2190         }
2191
2192         for (size_t i = (isID ? deviceID : 0);
2193              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2194              i++)
2195         {
2196             std::string name;
2197             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2198             cl_bool useGPU = true;
2199             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2200             {
2201                 cl_bool isIGPU = CL_FALSE;
2202                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2203                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2204             }
2205             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2206             {
2207                 // TODO check for OpenCL 1.1
2208                 return devices[i];
2209             }
2210         }
2211     }
2212
2213 not_found:
2214     if (!configuration)
2215         return NULL; // suppress messages on stderr
2216
2217     std::ostringstream msg;
2218     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2219         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2220         << "    Device types:";
2221     for (size_t t = 0; t < deviceTypes.size(); t++)
2222         msg << ' ' << deviceTypes[t];
2223
2224     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2225
2226     CV_LOG_ERROR(NULL, msg.str());
2227     return NULL;
2228 }
2229 #endif
2230
2231 #ifdef HAVE_OPENCL_SVM
2232 namespace svm {
2233
2234 enum AllocatorFlags { // don't use first 16 bits
2235         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2236         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2237         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2238         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2239         OPENCL_SVM_BUFFER_MAP = 4 << 16
2240 };
2241
2242 static bool checkForceSVMUmatUsage()
2243 {
2244     static bool initialized = false;
2245     static bool force = false;
2246     if (!initialized)
2247     {
2248         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2249         initialized = true;
2250     }
2251     return force;
2252 }
2253 static bool checkDisableSVMUMatUsage()
2254 {
2255     static bool initialized = false;
2256     static bool force = false;
2257     if (!initialized)
2258     {
2259         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2260         initialized = true;
2261     }
2262     return force;
2263 }
2264 static bool checkDisableSVM()
2265 {
2266     static bool initialized = false;
2267     static bool force = false;
2268     if (!initialized)
2269     {
2270         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2271         initialized = true;
2272     }
2273     return force;
2274 }
2275 // see SVMCapabilities
2276 static unsigned int getSVMCapabilitiesMask()
2277 {
2278     static bool initialized = false;
2279     static unsigned int mask = 0;
2280     if (!initialized)
2281     {
2282         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2283         if (envValue == NULL)
2284         {
2285             return ~0U; // all bits 1
2286         }
2287         mask = atoi(envValue);
2288         initialized = true;
2289     }
2290     return mask;
2291 }
2292 } // namespace
2293 #endif
2294
2295 static size_t getProgramCountLimit()
2296 {
2297     static bool initialized = false;
2298     static size_t count = 0;
2299     if (!initialized)
2300     {
2301         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2302         initialized = true;
2303     }
2304     return count;
2305 }
2306
2307 static int g_contextId = 0;
2308
2309 class OpenCLBufferPoolImpl;
2310 class OpenCLSVMBufferPoolImpl;
2311
2312 struct Context::Impl
2313 {
2314     static Context::Impl* get(Context& context) { return context.p; }
2315
2316     typedef std::deque<Context::Impl*> container_t;
2317     static container_t& getGlobalContainer()
2318     {
2319         // never delete this container (Impl lifetime is greater due to TLS storage)
2320         static container_t* g_contexts = new container_t();
2321         return *g_contexts;
2322     }
2323
2324 protected:
2325     Impl(const std::string& configuration_)
2326         : refcount(1)
2327         , contextId(CV_XADD(&g_contextId, 1))
2328         , configuration(configuration_)
2329         , handle(0)
2330 #ifdef HAVE_OPENCL_SVM
2331         , svmInitialized(false)
2332 #endif
2333     {
2334         if (!haveOpenCL())
2335             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2336
2337         cv::AutoLock lock(cv::getInitializationMutex());
2338         auto& container = getGlobalContainer();
2339         container.resize(std::max(container.size(), (size_t)contextId + 1));
2340         container[contextId] = this;
2341     }
2342
2343     ~Impl()
2344     {
2345 #ifdef _WIN32
2346         if (!cv::__termination)
2347 #endif
2348         {
2349             if (handle)
2350             {
2351                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2352                 handle = NULL;
2353             }
2354             devices.clear();
2355         }
2356
2357         {
2358             cv::AutoLock lock(cv::getInitializationMutex());
2359             auto& container = getGlobalContainer();
2360             CV_CheckLT((size_t)contextId, container.size(), "");
2361             container[contextId] = NULL;
2362         }
2363     }
2364
2365     void init_device_list()
2366     {
2367         CV_Assert(handle);
2368
2369         cl_uint ndevices = 0;
2370         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2371         CV_Assert(ndevices > 0);
2372
2373         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2374         size_t devices_ret_size = 0;
2375         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2376         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2377
2378         devices.clear();
2379         for (unsigned i = 0; i < ndevices; i++)
2380         {
2381             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2382         }
2383     }
2384
2385     void __init_buffer_pools();  // w/o synchronization
2386     void _init_buffer_pools() const
2387     {
2388         if (!bufferPool_)
2389         {
2390             cv::AutoLock lock(cv::getInitializationMutex());
2391             if (!bufferPool_)
2392             {
2393                 const_cast<Impl*>(this)->__init_buffer_pools();
2394             }
2395         }
2396     }
2397 public:
2398     static Impl* findContext(const std::string& configuration)
2399     {
2400         CV_TRACE_FUNCTION();
2401         cv::AutoLock lock(cv::getInitializationMutex());
2402         auto& container = getGlobalContainer();
2403         if (configuration.empty() && !container.empty())
2404             return container[0];
2405         for (auto it = container.begin(); it != container.end(); ++it)
2406         {
2407             Impl* i = *it;
2408             if (i && i->configuration == configuration)
2409             {
2410                 return i;
2411             }
2412         }
2413         return NULL;
2414     }
2415
2416     static Impl* findOrCreateContext(const std::string& configuration_)
2417     {
2418         CV_TRACE_FUNCTION();
2419         std::string configuration = configuration_;
2420         if (configuration_.empty())
2421         {
2422             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2423             if (c)
2424                 configuration = c;
2425         }
2426         Impl* impl = findContext(configuration);
2427         if (impl)
2428         {
2429             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2430             return impl;
2431         }
2432
2433         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2434         if (d == NULL)
2435             return NULL;
2436
2437         impl = new Impl(configuration);
2438         try
2439         {
2440             impl->createFromDevice(d);
2441             if (impl->handle)
2442                 return impl;
2443             delete impl;
2444             return NULL;
2445         }
2446         catch (...)
2447         {
2448             delete impl;
2449             throw;
2450         }
2451     }
2452
2453     static Impl* findOrCreateContext(cl_context h)
2454     {
2455         CV_TRACE_FUNCTION();
2456
2457         CV_Assert(h);
2458
2459         std::string configuration = cv::format("@ctx-%p", (void*)h);
2460         Impl* impl = findContext(configuration);
2461         if (impl)
2462         {
2463             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2464             impl->addref();
2465             return impl;
2466         }
2467
2468         impl = new Impl(configuration);
2469         try
2470         {
2471             CV_OCL_CHECK(clRetainContext(h));
2472             impl->handle = h;
2473             impl->init_device_list();
2474             return impl;
2475         }
2476         catch (...)
2477         {
2478             delete impl;
2479             throw;
2480         }
2481     }
2482
2483     static Impl* findOrCreateContext(const ocl::Device& device)
2484     {
2485         CV_TRACE_FUNCTION();
2486
2487         CV_Assert(!device.empty());
2488         cl_device_id d = (cl_device_id)device.ptr();
2489         CV_Assert(d);
2490
2491         std::string configuration = cv::format("@dev-%p", (void*)d);
2492         Impl* impl = findContext(configuration);
2493         if (impl)
2494         {
2495             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2496             impl->addref();
2497             return impl;
2498         }
2499
2500         impl = new Impl(configuration);
2501         try
2502         {
2503             impl->createFromDevice(d);
2504             CV_Assert(impl->handle);
2505             return impl;
2506         }
2507         catch (...)
2508         {
2509             delete impl;
2510             throw;
2511         }
2512     }
2513
2514     void setDefault()
2515     {
2516         CV_TRACE_FUNCTION();
2517         cl_device_id d = selectOpenCLDevice();
2518
2519         if (d == NULL)
2520             return;
2521
2522         createFromDevice(d);
2523     }
2524
2525     void createFromDevice(cl_device_id d)
2526     {
2527         CV_TRACE_FUNCTION();
2528         CV_Assert(handle == NULL);
2529
2530         cl_platform_id pl = NULL;
2531         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2532
2533         cl_context_properties prop[] =
2534         {
2535             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2536             0
2537         };
2538
2539         // !!! in the current implementation force the number of devices to 1 !!!
2540         cl_uint nd = 1;
2541         cl_int status;
2542
2543         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2544         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2545
2546         bool ok = handle != 0 && status == CL_SUCCESS;
2547         if( ok )
2548         {
2549             devices.resize(nd);
2550             devices[0].set(d);
2551         }
2552         else
2553             handle = NULL;
2554     }
2555
2556     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2557
2558     void unloadProg(Program& prog)
2559     {
2560         cv::AutoLock lock(program_cache_mutex);
2561         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2562         {
2563               phash_t::iterator it = phash.find(*i);
2564               if (it != phash.end())
2565               {
2566                   if (it->second.ptr() == prog.ptr())
2567                   {
2568                       phash.erase(*i);
2569                       cacheList.erase(i);
2570                       return;
2571                   }
2572               }
2573         }
2574     }
2575
2576     std::string& getPrefixString()
2577     {
2578         if (prefix.empty())
2579         {
2580             cv::AutoLock lock(program_cache_mutex);
2581             if (prefix.empty())
2582             {
2583                 CV_Assert(!devices.empty());
2584                 const Device& d = devices[0];
2585                 int bits = d.addressBits();
2586                 if (bits > 0 && bits != 64)
2587                     prefix = cv::format("%d-bit--", bits);
2588                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2589                 // sanitize chars
2590                 for (size_t i = 0; i < prefix.size(); i++)
2591                 {
2592                     char c = prefix[i];
2593                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2594                     {
2595                         prefix[i] = '_';
2596                     }
2597                 }
2598             }
2599         }
2600         return prefix;
2601     }
2602
2603     std::string& getPrefixBase()
2604     {
2605         if (prefix_base.empty())
2606         {
2607             cv::AutoLock lock(program_cache_mutex);
2608             if (prefix_base.empty())
2609             {
2610                 const Device& d = devices[0];
2611                 int bits = d.addressBits();
2612                 if (bits > 0 && bits != 64)
2613                     prefix_base = cv::format("%d-bit--", bits);
2614                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2615                 // sanitize chars
2616                 for (size_t i = 0; i < prefix_base.size(); i++)
2617                 {
2618                     char c = prefix_base[i];
2619                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2620                     {
2621                         prefix_base[i] = '_';
2622                     }
2623                 }
2624             }
2625         }
2626         return prefix_base;
2627     }
2628
2629     IMPLEMENT_REFCOUNTABLE();
2630
2631     const int contextId;  // global unique ID
2632     const std::string configuration;
2633
2634     cl_context handle;
2635     std::vector<Device> devices;
2636
2637     std::string prefix;
2638     std::string prefix_base;
2639
2640     cv::Mutex program_cache_mutex;
2641     typedef std::map<std::string, Program> phash_t;
2642     phash_t phash;
2643     typedef std::list<cv::String> CacheList;
2644     CacheList cacheList;
2645
2646     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2647     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2648     OpenCLBufferPoolImpl& getBufferPool() const
2649     {
2650         _init_buffer_pools();
2651         CV_DbgAssert(bufferPool_);
2652         return *bufferPool_.get();
2653     }
2654     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2655     {
2656         _init_buffer_pools();
2657         CV_DbgAssert(bufferPoolHostPtr_);
2658         return *bufferPoolHostPtr_.get();
2659     }
2660
2661 #ifdef HAVE_OPENCL_SVM
2662     bool svmInitialized;
2663     bool svmAvailable;
2664     bool svmEnabled;
2665     svm::SVMCapabilities svmCapabilities;
2666     svm::SVMFunctions svmFunctions;
2667
2668     void svmInit()
2669     {
2670         CV_Assert(handle != NULL);
2671         const Device& device = devices[0];
2672         cl_device_svm_capabilities deviceCaps = 0;
2673         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2674         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2675         if (status != CL_SUCCESS)
2676         {
2677             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2678             goto noSVM;
2679         }
2680         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2681         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2682         svmCapabilities.value_ =
2683                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2684                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2685                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2686                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2687         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2688         if (svmCapabilities.value_ == 0)
2689         {
2690             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2691             goto noSVM;
2692         }
2693         try
2694         {
2695             // Try OpenCL 2.0
2696             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2697             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2698             if (!ptr)
2699             {
2700                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2701                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2702             }
2703             try
2704             {
2705                 bool error = false;
2706                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2707                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2708                 {
2709                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2710                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2711                 }
2712                 clFinish(q);
2713                 try
2714                 {
2715                     ((int*)ptr)[0] = 100;
2716                 }
2717                 catch (...)
2718                 {
2719                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2720                     error = true;
2721                 }
2722                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2723                 {
2724                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2725                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2726                 }
2727                 clFinish(q);
2728                 if (error)
2729                 {
2730                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2731                 }
2732             }
2733             catch (...)
2734             {
2735                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2736                 clSVMFree(handle, ptr);
2737                 throw;
2738             }
2739             clSVMFree(handle, ptr);
2740             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2741             svmFunctions.fn_clSVMFree = clSVMFree;
2742             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2743             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2744             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2745             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2746             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2747             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2748             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2749         }
2750         catch (...)
2751         {
2752             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2753             try
2754             {
2755                 // Try HSA extension
2756                 String extensions = device.extensions();
2757                 if (extensions.find("cl_amd_svm") == String::npos)
2758                 {
2759                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2760                     goto noSVM;
2761                 }
2762                 cl_platform_id p = NULL;
2763                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2764                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2765                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2766                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2767                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2768                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2769                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2770                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2771                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2772                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2773                 CV_Assert(svmFunctions.isValid());
2774             }
2775             catch (...)
2776             {
2777                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2778                 goto noSVM;
2779             }
2780         }
2781
2782         svmAvailable = true;
2783         svmEnabled = !svm::checkDisableSVM();
2784         svmInitialized = true;
2785         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2786         return;
2787     noSVM:
2788         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2789         svmAvailable = false;
2790         svmEnabled = false;
2791         svmCapabilities.value_ = 0;
2792         svmInitialized = true;
2793         svmFunctions.fn_clSVMAlloc = NULL;
2794         return;
2795     }
2796
2797     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2798
2799     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2800     {
2801         _init_buffer_pools();
2802         CV_DbgAssert(bufferPoolSVM_);
2803         return *bufferPoolSVM_.get();
2804     }
2805 #endif
2806
2807     friend class Program;
2808 };
2809
2810
2811 Context::Context()
2812 {
2813     p = 0;
2814 }
2815
2816 Context::~Context()
2817 {
2818     release();
2819 }
2820
2821 // deprecated
2822 Context::Context(int dtype)
2823 {
2824     p = 0;
2825     create(dtype);
2826 }
2827
2828 void Context::release()
2829 {
2830     if (p)
2831     {
2832         p->release();
2833         p = NULL;
2834     }
2835 }
2836
2837 bool Context::create()
2838 {
2839     release();
2840     if (!haveOpenCL())
2841         return false;
2842     p = Impl::findOrCreateContext(std::string());
2843     if (p && p->handle)
2844         return true;
2845     release();
2846     return false;
2847 }
2848
2849 // deprecated
2850 bool Context::create(int dtype)
2851 {
2852     if( !haveOpenCL() )
2853         return false;
2854     release();
2855     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2856     {
2857         p = Impl::findOrCreateContext("");
2858     }
2859     else if (dtype == CL_DEVICE_TYPE_GPU)
2860     {
2861         p = Impl::findOrCreateContext(":GPU:");
2862     }
2863     else if (dtype == CL_DEVICE_TYPE_CPU)
2864     {
2865         p = Impl::findOrCreateContext(":CPU:");
2866     }
2867     else
2868     {
2869         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2870     }
2871     if (p && !p->handle)
2872     {
2873         release();
2874     }
2875     return p != 0;
2876 }
2877
2878 Context::Context(const Context& c)
2879 {
2880     p = (Impl*)c.p;
2881     if(p)
2882         p->addref();
2883 }
2884
2885 Context& Context::operator = (const Context& c)
2886 {
2887     Impl* newp = (Impl*)c.p;
2888     if(newp)
2889         newp->addref();
2890     if(p)
2891         p->release();
2892     p = newp;
2893     return *this;
2894 }
2895
2896 void* Context::ptr() const
2897 {
2898     return p == NULL ? NULL : p->handle;
2899 }
2900
2901 size_t Context::ndevices() const
2902 {
2903     return p ? p->devices.size() : 0;
2904 }
2905
2906 Device& Context::device(size_t idx) const
2907 {
2908     static Device dummy;
2909     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2910 }
2911
2912 Context& Context::getDefault(bool initialize)
2913 {
2914     auto& c = OpenCLExecutionContext::getCurrent();
2915     if (!c.empty())
2916     {
2917         auto& ctx = c.getContext();
2918         return ctx;
2919     }
2920
2921     CV_UNUSED(initialize);
2922     static Context dummy;
2923     return dummy;
2924 }
2925
2926 Program Context::getProg(const ProgramSource& prog,
2927                          const String& buildopts, String& errmsg)
2928 {
2929     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2930 }
2931
2932 void Context::unloadProg(Program& prog)
2933 {
2934     if (p)
2935         p->unloadProg(prog);
2936 }
2937
2938 /* static */
2939 Context Context::fromHandle(void* context)
2940 {
2941     Context ctx;
2942     ctx.p = Impl::findOrCreateContext((cl_context)context);
2943     return ctx;
2944 }
2945
2946 /* static */
2947 Context Context::fromDevice(const ocl::Device& device)
2948 {
2949     Context ctx;
2950     ctx.p = Impl::findOrCreateContext(device);
2951     return ctx;
2952 }
2953
2954 /* static */
2955 Context Context::create(const std::string& configuration)
2956 {
2957     Context ctx;
2958     ctx.p = Impl::findOrCreateContext(configuration);
2959     return ctx;
2960 }
2961
2962 #ifdef HAVE_OPENCL_SVM
2963 bool Context::useSVM() const
2964 {
2965     Context::Impl* i = p;
2966     CV_Assert(i);
2967     if (!i->svmInitialized)
2968         i->svmInit();
2969     return i->svmEnabled;
2970 }
2971 void Context::setUseSVM(bool enabled)
2972 {
2973     Context::Impl* i = p;
2974     CV_Assert(i);
2975     if (!i->svmInitialized)
2976         i->svmInit();
2977     if (enabled && !i->svmAvailable)
2978     {
2979         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2980     }
2981     i->svmEnabled = enabled;
2982 }
2983 #else
2984 bool Context::useSVM() const { return false; }
2985 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2986 #endif
2987
2988 #ifdef HAVE_OPENCL_SVM
2989 namespace svm {
2990
2991 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2992 {
2993     Context::Impl* i = context.p;
2994     CV_Assert(i);
2995     if (!i->svmInitialized)
2996         i->svmInit();
2997     return i->svmCapabilities;
2998 }
2999
3000 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3001 {
3002     Context::Impl* i = context.p;
3003     CV_Assert(i);
3004     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3005     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3006     return &i->svmFunctions;
3007 }
3008
3009 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3010 {
3011     if (checkForceSVMUmatUsage())
3012         return true;
3013     if (checkDisableSVMUMatUsage())
3014         return false;
3015     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3016         return true;
3017     return false; // don't use SVM by default
3018 }
3019
3020 } // namespace cv::ocl::svm
3021 #endif // HAVE_OPENCL_SVM
3022
3023
3024 static void get_platform_name(cl_platform_id id, String& name)
3025 {
3026     // get platform name string length
3027     size_t sz = 0;
3028     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3029
3030     // get platform name string
3031     AutoBuffer<char> buf(sz + 1);
3032     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3033
3034     // just in case, ensure trailing zero for ASCIIZ string
3035     buf[sz] = 0;
3036
3037     name = buf.data();
3038 }
3039
3040 /*
3041 // Attaches OpenCL context to OpenCV
3042 */
3043 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3044 {
3045     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3046     ctx.bind();
3047 }
3048
3049 /* static */
3050 OpenCLExecutionContext OpenCLExecutionContext::create(
3051         const std::string& platformName, void* platformID, void* context, void* deviceID
3052 )
3053 {
3054     if (!haveOpenCL())
3055         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3056
3057     cl_uint cnt = 0;
3058     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3059
3060     if (cnt == 0)
3061         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3062
3063     std::vector<cl_platform_id> platforms(cnt);
3064
3065     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3066
3067     bool platformAvailable = false;
3068
3069     // check if external platformName contained in list of available platforms in OpenCV
3070     for (unsigned int i = 0; i < cnt; i++)
3071     {
3072         String availablePlatformName;
3073         get_platform_name(platforms[i], availablePlatformName);
3074         // external platform is found in the list of available platforms
3075         if (platformName == availablePlatformName)
3076         {
3077             platformAvailable = true;
3078             break;
3079         }
3080     }
3081
3082     if (!platformAvailable)
3083         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3084
3085     // check if platformID corresponds to platformName
3086     String actualPlatformName;
3087     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3088     if (platformName != actualPlatformName)
3089         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3090
3091     OpenCLExecutionContext ctx;
3092     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3093     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3094     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3095     return ctx;
3096 }
3097
3098 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3099 {
3100     // internal call, less checks
3101     cl_platform_id platformID = (cl_platform_id)_platform;
3102     cl_context context = (cl_context)_context;
3103     cl_device_id deviceID = (cl_device_id)_device;
3104
3105     std::string platformName = PlatformInfo(platformID).name();
3106
3107     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3108     CV_Assert(!clExecCtx.empty());
3109     ctx = clExecCtx.getContext();
3110 }
3111
3112 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3113
3114 struct Queue::Impl
3115 {
3116     inline void __init()
3117     {
3118         refcount = 1;
3119         handle = 0;
3120         isProfilingQueue_ = false;
3121     }
3122
3123     Impl(cl_command_queue q)
3124     {
3125         __init();
3126         handle = q;
3127
3128         cl_command_queue_properties props = 0;
3129         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3130         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3131     }
3132
3133     Impl(cl_command_queue q, bool isProfilingQueue)
3134     {
3135         __init();
3136         handle = q;
3137         isProfilingQueue_ = isProfilingQueue;
3138     }
3139
3140     Impl(const Context& c, const Device& d, bool withProfiling = false)
3141     {
3142         __init();
3143
3144         const Context* pc = &c;
3145         cl_context ch = (cl_context)pc->ptr();
3146         if( !ch )
3147         {
3148             pc = &Context::getDefault();
3149             ch = (cl_context)pc->ptr();
3150         }
3151         cl_device_id dh = (cl_device_id)d.ptr();
3152         if( !dh )
3153             dh = (cl_device_id)pc->device(0).ptr();
3154         cl_int retval = 0;
3155         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3156         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3157         isProfilingQueue_ = withProfiling;
3158     }
3159
3160     ~Impl()
3161     {
3162 #ifdef _WIN32
3163         if (!cv::__termination)
3164 #endif
3165         {
3166             if(handle)
3167             {
3168                 CV_OCL_DBG_CHECK(clFinish(handle));
3169                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3170                 handle = NULL;
3171             }
3172         }
3173     }
3174
3175     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3176     {
3177         if (isProfilingQueue_)
3178             return self;
3179
3180         if (profiling_queue_.ptr())
3181             return profiling_queue_;
3182
3183         cl_context ctx = 0;
3184         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3185
3186         cl_device_id device = 0;
3187         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3188
3189         cl_int result = CL_SUCCESS;
3190         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3191         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3192         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3193
3194         Queue queue;
3195         queue.p = new Impl(q, true);
3196         profiling_queue_ = queue;
3197
3198         return profiling_queue_;
3199     }
3200
3201     IMPLEMENT_REFCOUNTABLE();
3202
3203     cl_command_queue handle;
3204     bool isProfilingQueue_;
3205     cv::ocl::Queue profiling_queue_;
3206 };
3207
3208 Queue::Queue()
3209 {
3210     p = 0;
3211 }
3212
3213 Queue::Queue(const Context& c, const Device& d)
3214 {
3215     p = 0;
3216     create(c, d);
3217 }
3218
3219 Queue::Queue(const Queue& q)
3220 {
3221     p = q.p;
3222     if(p)
3223         p->addref();
3224 }
3225
3226 Queue& Queue::operator = (const Queue& q)
3227 {
3228     Impl* newp = (Impl*)q.p;
3229     if(newp)
3230         newp->addref();
3231     if(p)
3232         p->release();
3233     p = newp;
3234     return *this;
3235 }
3236
3237 Queue::~Queue()
3238 {
3239     if(p)
3240         p->release();
3241 }
3242
3243 bool Queue::create(const Context& c, const Device& d)
3244 {
3245     if(p)
3246         p->release();
3247     p = new Impl(c, d);
3248     return p->handle != 0;
3249 }
3250
3251 void Queue::finish()
3252 {
3253     if(p && p->handle)
3254     {
3255         CV_OCL_DBG_CHECK(clFinish(p->handle));
3256     }
3257 }
3258
3259 const Queue& Queue::getProfilingQueue() const
3260 {
3261     CV_Assert(p);
3262     return p->getProfilingQueue(*this);
3263 }
3264
3265 void* Queue::ptr() const
3266 {
3267     return p ? p->handle : 0;
3268 }
3269
3270 Queue& Queue::getDefault()
3271 {
3272     auto& c = OpenCLExecutionContext::getCurrent();
3273     if (!c.empty())
3274     {
3275         auto& q = c.getQueue();
3276         return q;
3277     }
3278     static Queue dummy;
3279     return dummy;
3280 }
3281
3282 static cl_command_queue getQueue(const Queue& q)
3283 {
3284     cl_command_queue qq = (cl_command_queue)q.ptr();
3285     if(!qq)
3286         qq = (cl_command_queue)Queue::getDefault().ptr();
3287     return qq;
3288 }
3289
3290 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3291
3292 KernelArg::KernelArg()
3293     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3294 {
3295 }
3296
3297 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3298     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3299 {
3300     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3301 }
3302
3303 KernelArg KernelArg::Constant(const Mat& m)
3304 {
3305     CV_Assert(m.isContinuous());
3306     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3307 }
3308
3309 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3310
3311 struct Kernel::Impl
3312 {
3313     Impl(const char* kname, const Program& prog) :
3314         refcount(1), handle(NULL), isInProgress(false), nu(0)
3315     {
3316         cl_program ph = (cl_program)prog.ptr();
3317         cl_int retval = 0;
3318         name = kname;
3319         if (ph)
3320         {
3321             handle = clCreateKernel(ph, kname, &retval);
3322             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3323         }
3324         for( int i = 0; i < MAX_ARRS; i++ )
3325             u[i] = 0;
3326         haveTempDstUMats = false;
3327         haveTempSrcUMats = false;
3328     }
3329
3330     void cleanupUMats()
3331     {
3332         for( int i = 0; i < MAX_ARRS; i++ )
3333             if( u[i] )
3334             {
3335                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3336                 {
3337                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3338                     u[i]->currAllocator->deallocate(u[i]);
3339                 }
3340                 u[i] = 0;
3341             }
3342         nu = 0;
3343         haveTempDstUMats = false;
3344         haveTempSrcUMats = false;
3345     }
3346
3347     void addUMat(const UMat& m, bool dst)
3348     {
3349         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3350         u[nu] = m.u;
3351         CV_XADD(&m.u->urefcount, 1);
3352         nu++;
3353         if(dst && m.u->tempUMat())
3354             haveTempDstUMats = true;
3355         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3356             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3357     }
3358
3359     void addImage(const Image2D& image)
3360     {
3361         images.push_back(image);
3362     }
3363
3364     void finit(cl_event e)
3365     {
3366         CV_UNUSED(e);
3367         cleanupUMats();
3368         images.clear();
3369         isInProgress = false;
3370         release();
3371     }
3372
3373     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3374             bool sync, int64* timeNS, const Queue& q);
3375
3376     ~Impl()
3377     {
3378         if(handle)
3379         {
3380             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3381         }
3382     }
3383
3384     IMPLEMENT_REFCOUNTABLE();
3385
3386     cv::String name;
3387     cl_kernel handle;
3388     enum { MAX_ARRS = 16 };
3389     UMatData* u[MAX_ARRS];
3390     bool isInProgress;
3391     int nu;
3392     std::list<Image2D> images;
3393     bool haveTempDstUMats;
3394     bool haveTempSrcUMats;
3395 };
3396
3397 }} // namespace cv::ocl
3398
3399 extern "C" {
3400
3401 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3402 {
3403     try
3404     {
3405         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3406     }
3407     catch (const cv::Exception& exc)
3408     {
3409         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3410     }
3411     catch (const std::exception& exc)
3412     {
3413         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3414     }
3415     catch (...)
3416     {
3417         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3418     }
3419 }
3420
3421 }
3422
3423 namespace cv { namespace ocl {
3424
3425 Kernel::Kernel()
3426 {
3427     p = 0;
3428 }
3429
3430 Kernel::Kernel(const char* kname, const Program& prog)
3431 {
3432     p = 0;
3433     create(kname, prog);
3434 }
3435
3436 Kernel::Kernel(const char* kname, const ProgramSource& src,
3437                const String& buildopts, String* errmsg)
3438 {
3439     p = 0;
3440     create(kname, src, buildopts, errmsg);
3441 }
3442
3443 Kernel::Kernel(const Kernel& k)
3444 {
3445     p = k.p;
3446     if(p)
3447         p->addref();
3448 }
3449
3450 Kernel& Kernel::operator = (const Kernel& k)
3451 {
3452     Impl* newp = (Impl*)k.p;
3453     if(newp)
3454         newp->addref();
3455     if(p)
3456         p->release();
3457     p = newp;
3458     return *this;
3459 }
3460
3461 Kernel::~Kernel()
3462 {
3463     if(p)
3464         p->release();
3465 }
3466
3467 bool Kernel::create(const char* kname, const Program& prog)
3468 {
3469     if(p)
3470         p->release();
3471     p = new Impl(kname, prog);
3472     if(p->handle == 0)
3473     {
3474         p->release();
3475         p = 0;
3476     }
3477 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3478     CV_Assert(p);
3479 #endif
3480     return p != 0;
3481 }
3482
3483 bool Kernel::create(const char* kname, const ProgramSource& src,
3484                     const String& buildopts, String* errmsg)
3485 {
3486     if(p)
3487     {
3488         p->release();
3489         p = 0;
3490     }
3491     String tempmsg;
3492     if( !errmsg ) errmsg = &tempmsg;
3493     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3494     return create(kname, prog);
3495 }
3496
3497 void* Kernel::ptr() const
3498 {
3499     return p ? p->handle : 0;
3500 }
3501
3502 bool Kernel::empty() const
3503 {
3504     return ptr() == 0;
3505 }
3506
3507 int Kernel::set(int i, const void* value, size_t sz)
3508 {
3509     if (!p || !p->handle)
3510         return -1;
3511     if (i < 0)
3512         return i;
3513     if( i == 0 )
3514         p->cleanupUMats();
3515
3516     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3517     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3518     if (retval != CL_SUCCESS)
3519         return -1;
3520     return i+1;
3521 }
3522
3523 int Kernel::set(int i, const Image2D& image2D)
3524 {
3525     p->addImage(image2D);
3526     cl_mem h = (cl_mem)image2D.ptr();
3527     return set(i, &h, sizeof(h));
3528 }
3529
3530 int Kernel::set(int i, const UMat& m)
3531 {
3532     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3533 }
3534
3535 int Kernel::set(int i, const KernelArg& arg)
3536 {
3537     if( !p || !p->handle )
3538         return -1;
3539     if (i < 0)
3540     {
3541         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3542                 p->name.c_str(), (int)i));
3543         return i;
3544     }
3545     if( i == 0 )
3546         p->cleanupUMats();
3547     cl_int status = 0;
3548     if( arg.m )
3549     {
3550         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3551                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3552         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3553         if (ptronly && arg.m->empty())
3554         {
3555             cl_mem h_null = (cl_mem)NULL;
3556             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3557             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3558             return i + 1;
3559         }
3560         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3561
3562         if (!h)
3563         {
3564             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3565                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3566             p->release();
3567             p = 0;
3568             return -1;
3569         }
3570
3571 #ifdef HAVE_OPENCL_SVM
3572         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3573         {
3574             const Context& ctx = Context::getDefault();
3575             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3576             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3577             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3578 #if 1 // TODO
3579             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3580 #else
3581             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3582 #endif
3583             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3584         }
3585         else
3586 #endif
3587         {
3588             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3589             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3590         }
3591
3592         if (ptronly)
3593         {
3594             i++;
3595         }
3596         else if( arg.m->dims <= 2 )
3597         {
3598             UMat2D u2d(*arg.m);
3599             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3600             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3601             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3602             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3603             i += 3;
3604
3605             if( !(arg.flags & KernelArg::NO_SIZE) )
3606             {
3607                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3608                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3609                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3610                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3611                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3612                 i += 2;
3613             }
3614         }
3615         else
3616         {
3617             UMat3D u3d(*arg.m);
3618             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3619             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3620             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3621             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3622             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3623             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3624             i += 4;
3625             if( !(arg.flags & KernelArg::NO_SIZE) )
3626             {
3627                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3628                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3629                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3630                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3631                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3632                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3633                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3634                 i += 3;
3635             }
3636         }
3637         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3638         return i;
3639     }
3640     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3641     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3642     return i+1;
3643 }
3644
3645 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3646                  bool sync, const Queue& q)
3647 {
3648     if (!p)
3649         return false;
3650
3651     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3652     size_t total = 1;
3653     CV_Assert(_globalsize != NULL);
3654     for (int i = 0; i < dims; i++)
3655     {
3656         size_t val = _localsize ? _localsize[i] :
3657             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3658         CV_Assert( val > 0 );
3659         total *= _globalsize[i];
3660         if (_globalsize[i] == 1 && !_localsize)
3661             val = 1;
3662         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3663     }
3664     CV_Assert(total > 0);
3665
3666     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3667 }
3668
3669
3670 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3671         bool sync, int64* timeNS, const Queue& q)
3672 {
3673     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3674
3675     if (!handle || isInProgress)
3676         return false;
3677
3678     cl_command_queue qq = getQueue(q);
3679     if (haveTempDstUMats)
3680         sync = true;
3681     if (haveTempSrcUMats)
3682         sync = true;
3683     if (timeNS)
3684         sync = true;
3685     cl_event asyncEvent = 0;
3686     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3687                                            NULL, globalsize, localsize, 0, 0,
3688                                            (sync && !timeNS) ? 0 : &asyncEvent);
3689 #if !CV_OPENCL_SHOW_RUN_KERNELS
3690     if (retval != CL_SUCCESS)
3691 #endif
3692     {
3693         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3694                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3695                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3696                         sync ? "true" : "false"
3697                         );
3698         if (retval != CL_SUCCESS)
3699         {
3700             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3701         }
3702 #if CV_OPENCL_TRACE_CHECK
3703         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3704 #else
3705         printf("%s\n", msg.c_str());
3706         fflush(stdout);
3707 #endif
3708     }
3709     if (sync || retval != CL_SUCCESS)
3710     {
3711         CV_OCL_DBG_CHECK(clFinish(qq));
3712         if (timeNS)
3713         {
3714             if (retval == CL_SUCCESS)
3715             {
3716                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3717                 cl_ulong startTime, stopTime;
3718                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3719                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3720                 *timeNS = (int64)(stopTime - startTime);
3721             }
3722             else
3723             {
3724                 *timeNS = -1;
3725             }
3726         }
3727         cleanupUMats();
3728     }
3729     else
3730     {
3731         addref();
3732         isInProgress = true;
3733         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3734     }
3735     if (asyncEvent)
3736         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3737     return retval == CL_SUCCESS;
3738 }
3739
3740 bool Kernel::runTask(bool sync, const Queue& q)
3741 {
3742     if(!p || !p->handle || p->isInProgress)
3743         return false;
3744
3745     cl_command_queue qq = getQueue(q);
3746     cl_event asyncEvent = 0;
3747     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3748     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3749     if (sync || retval != CL_SUCCESS)
3750     {
3751         CV_OCL_DBG_CHECK(clFinish(qq));
3752         p->cleanupUMats();
3753     }
3754     else
3755     {
3756         p->addref();
3757         p->isInProgress = true;
3758         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3759     }
3760     if (asyncEvent)
3761         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3762     return retval == CL_SUCCESS;
3763 }
3764
3765 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3766 {
3767     CV_Assert(p && p->handle && !p->isInProgress);
3768     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3769     CV_Assert(q.ptr());
3770     q.finish(); // call clFinish() on base queue
3771     Queue profilingQueue = q.getProfilingQueue();
3772     int64 timeNs = -1;
3773     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3774     return res ? timeNs : -1;
3775 }
3776
3777 size_t Kernel::workGroupSize() const
3778 {
3779     if(!p || !p->handle)
3780         return 0;
3781     size_t val = 0, retsz = 0;
3782     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3783     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3784     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3785     return status == CL_SUCCESS ? val : 0;
3786 }
3787
3788 size_t Kernel::preferedWorkGroupSizeMultiple() const
3789 {
3790     if(!p || !p->handle)
3791         return 0;
3792     size_t val = 0, retsz = 0;
3793     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3794     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3795     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3796     return status == CL_SUCCESS ? val : 0;
3797 }
3798
3799 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3800 {
3801     if(!p || !p->handle || !wsz)
3802         return 0;
3803     size_t retsz = 0;
3804     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3805     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3806     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3807     return status == CL_SUCCESS;
3808 }
3809
3810 size_t Kernel::localMemSize() const
3811 {
3812     if(!p || !p->handle)
3813         return 0;
3814     size_t retsz = 0;
3815     cl_ulong val = 0;
3816     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3817     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3818     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3819     return status == CL_SUCCESS ? (size_t)val : 0;
3820 }
3821
3822
3823
3824 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3825
3826 struct ProgramSource::Impl
3827 {
3828     IMPLEMENT_REFCOUNTABLE();
3829
3830     enum KIND {
3831         PROGRAM_SOURCE_CODE = 0,
3832         PROGRAM_BINARIES,
3833         PROGRAM_SPIR,
3834         PROGRAM_SPIRV
3835     } kind_;
3836
3837     Impl(const String& src)
3838     {
3839         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3840         initFromSource(src, cv::String());
3841     }
3842     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3843     {
3844         init(PROGRAM_SOURCE_CODE, module, name);
3845         initFromSource(codeStr, codeHash);
3846     }
3847
3848     /// reset fields
3849     void init(enum KIND kind, const String& module, const String& name)
3850     {
3851         refcount = 1;
3852         kind_ = kind;
3853         module_ = module;
3854         name_ = name;
3855
3856         sourceAddr_ = NULL;
3857         sourceSize_ = 0;
3858         isHashUpdated = false;
3859     }
3860
3861     void initFromSource(const String& codeStr, const String& codeHash)
3862     {
3863         codeStr_ = codeStr;
3864         sourceHash_ = codeHash;
3865         if (sourceHash_.empty())
3866         {
3867             updateHash();
3868         }
3869         else
3870         {
3871             isHashUpdated = true;
3872         }
3873     }
3874
3875     void updateHash(const char* hashStr = NULL)
3876     {
3877         if (hashStr)
3878         {
3879             sourceHash_ = cv::String(hashStr);
3880             isHashUpdated = true;
3881             return;
3882         }
3883         uint64 hash = 0;
3884         switch (kind_)
3885         {
3886         case PROGRAM_SOURCE_CODE:
3887             if (sourceAddr_)
3888             {
3889                 CV_Assert(codeStr_.empty());
3890                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3891             }
3892             else
3893             {
3894                 CV_Assert(!codeStr_.empty());
3895                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3896             }
3897             break;
3898         case PROGRAM_BINARIES:
3899         case PROGRAM_SPIR:
3900         case PROGRAM_SPIRV:
3901             hash = crc64(sourceAddr_, sourceSize_);
3902             break;
3903         default:
3904             CV_Error(Error::StsInternal, "Internal error");
3905         }
3906         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3907         isHashUpdated = true;
3908     }
3909
3910     Impl(enum KIND kind,
3911             const String& module, const String& name,
3912             const unsigned char* binary, const size_t size,
3913             const cv::String& buildOptions = cv::String())
3914     {
3915         init(kind, module, name);
3916
3917         sourceAddr_ = binary;
3918         sourceSize_ = size;
3919
3920         buildOptions_ = buildOptions;
3921     }
3922
3923     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3924             const char* sourceCodeStaticStr, const char* hashStaticStr,
3925             const cv::String& buildOptions)
3926     {
3927         ProgramSource result;
3928         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3929                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3930         result.p->updateHash(hashStaticStr);
3931         return result;
3932     }
3933
3934     static ProgramSource fromBinary(const String& module, const String& name,
3935             const unsigned char* binary, const size_t size,
3936             const cv::String& buildOptions)
3937     {
3938         ProgramSource result;
3939         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3940         return result;
3941     }
3942
3943     static ProgramSource fromSPIR(const String& module, const String& name,
3944             const unsigned char* binary, const size_t size,
3945             const cv::String& buildOptions)
3946     {
3947         ProgramSource result;
3948         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3949         return result;
3950     }
3951
3952     String module_;
3953     String name_;
3954
3955     // TODO std::vector<ProgramSource> includes_;
3956     String codeStr_; // PROGRAM_SOURCE_CODE only
3957
3958     const unsigned char* sourceAddr_;
3959     size_t sourceSize_;
3960
3961     cv::String buildOptions_;
3962
3963     String sourceHash_;
3964     bool isHashUpdated;
3965
3966     friend struct Program::Impl;
3967     friend struct internal::ProgramEntry;
3968     friend struct Context::Impl;
3969 };
3970
3971
3972 ProgramSource::ProgramSource()
3973 {
3974     p = 0;
3975 }
3976
3977 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3978 {
3979     p = new Impl(module, name, codeStr, codeHash);
3980 }
3981
3982 ProgramSource::ProgramSource(const char* prog)
3983 {
3984     p = new Impl(prog);
3985 }
3986
3987 ProgramSource::ProgramSource(const String& prog)
3988 {
3989     p = new Impl(prog);
3990 }
3991
3992 ProgramSource::~ProgramSource()
3993 {
3994     if(p)
3995         p->release();
3996 }
3997
3998 ProgramSource::ProgramSource(const ProgramSource& prog)
3999 {
4000     p = prog.p;
4001     if(p)
4002         p->addref();
4003 }
4004
4005 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4006 {
4007     Impl* newp = (Impl*)prog.p;
4008     if(newp)
4009         newp->addref();
4010     if(p)
4011         p->release();
4012     p = newp;
4013     return *this;
4014 }
4015
4016 const String& ProgramSource::source() const
4017 {
4018     CV_Assert(p);
4019     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4020     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4021     return p->codeStr_;
4022 }
4023
4024 ProgramSource::hash_t ProgramSource::hash() const
4025 {
4026     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4027 }
4028
4029 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4030         const unsigned char* binary, const size_t size,
4031         const cv::String& buildOptions)
4032 {
4033     CV_Assert(binary);
4034     CV_Assert(size > 0);
4035     return Impl::fromBinary(module, name, binary, size, buildOptions);
4036 }
4037
4038 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4039         const unsigned char* binary, const size_t size,
4040         const cv::String& buildOptions)
4041 {
4042     CV_Assert(binary);
4043     CV_Assert(size > 0);
4044     return Impl::fromBinary(module, name, binary, size, buildOptions);
4045 }
4046
4047
4048 internal::ProgramEntry::operator ProgramSource&() const
4049 {
4050     if (this->pProgramSource == NULL)
4051     {
4052         cv::AutoLock lock(cv::getInitializationMutex());
4053         if (this->pProgramSource == NULL)
4054         {
4055             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4056             ProgramSource* ptr = new ProgramSource(ps);
4057             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4058         }
4059     }
4060     return *this->pProgramSource;
4061 }
4062
4063
4064
4065 /////////////////////////////////////////// Program /////////////////////////////////////////////
4066
4067 static
4068 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4069 {
4070     if (b.empty())
4071         return a;
4072     if (a.empty())
4073         return b;
4074     if (b[0] == ' ')
4075         return a + b;
4076     return a + (cv::String(" ") + b);
4077 }
4078
4079 struct Program::Impl
4080 {
4081     IMPLEMENT_REFCOUNTABLE();
4082
4083     Impl(const ProgramSource& src,
4084          const String& _buildflags, String& errmsg) :
4085          refcount(1),
4086          handle(NULL),
4087          buildflags(_buildflags)
4088     {
4089         const ProgramSource::Impl* src_ = src.getImpl();
4090         CV_Assert(src_);
4091         sourceModule_ = src_->module_;
4092         sourceName_ = src_->name_;
4093         const Context ctx = Context::getDefault();
4094         Device device = ctx.device(0);
4095         if (ctx.ptr() == NULL || device.ptr() == NULL)
4096             return;
4097         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4098         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4099         {
4100             if (device.isAMD())
4101                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4102             else if (device.isIntel())
4103                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4104             const String param_buildExtraOptions = getBuildExtraOptions();
4105             if (!param_buildExtraOptions.empty())
4106                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4107         }
4108         compile(ctx, src_, errmsg);
4109     }
4110
4111     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4112     {
4113         CV_Assert(ctx.getImpl());
4114         CV_Assert(src_);
4115
4116         // We don't cache OpenCL binaries
4117         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4118         {
4119             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4120             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4121             return isLoaded;
4122         }
4123         return compileWithCache(ctx, src_, errmsg);
4124     }
4125
4126     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4127     {
4128         CV_Assert(ctx.getImpl());
4129         CV_Assert(src_);
4130         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4131
4132 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4133         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4134         const std::string base_dir = config.prepareCacheDirectoryForContext(
4135                 ctx.getImpl()->getPrefixString(),
4136                 ctx.getImpl()->getPrefixBase()
4137         );
4138         const String& hash_str = src_->sourceHash_;
4139         cv::String fname;
4140         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4141         {
4142             CV_Assert(!hash_str.empty());
4143             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4144             fname = utils::fs::join(base_dir, fname);
4145         }
4146         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4147         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4148         {
4149             try
4150             {
4151                 std::vector<char> binaryBuf;
4152                 bool res = false;
4153                 {
4154                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4155                     BinaryProgramFile file(fname, hash_str.c_str());
4156                     res = file.read(buildflags, binaryBuf);
4157                 }
4158                 if (res)
4159                 {
4160                     CV_Assert(!binaryBuf.empty());
4161                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4162                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4163                     if (isLoaded)
4164                         return true;
4165                 }
4166             }
4167             catch (const cv::Exception& e)
4168             {
4169                 CV_UNUSED(e);
4170                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4171             }
4172             catch (...)
4173             {
4174                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4175             }
4176         }
4177 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4178         CV_Assert(handle == NULL);
4179         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4180         {
4181             if (!buildFromSources(ctx, src_, errmsg))
4182             {
4183                 return false;
4184             }
4185         }
4186         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4187         {
4188             buildflags = joinBuildOptions(buildflags, " -x spir");
4189             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4190             {
4191                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4192             }
4193             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4194             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4195             if (!isLoaded)
4196                 return false;
4197         }
4198         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4199         {
4200             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4201         }
4202         else
4203         {
4204             CV_Error(Error::StsInternal, "Internal error");
4205         }
4206         CV_Assert(handle != NULL);
4207 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4208         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4209         {
4210             try
4211             {
4212                 std::vector<char> binaryBuf;
4213                 getProgramBinary(binaryBuf);
4214                 {
4215                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4216                     BinaryProgramFile file(fname, hash_str.c_str());
4217                     file.write(buildflags, binaryBuf);
4218                 }
4219             }
4220             catch (const cv::Exception& e)
4221             {
4222                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4223             }
4224             catch (...)
4225             {
4226                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4227             }
4228         }
4229 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4230 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4231         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4232         {
4233             std::vector<char> binaryBuf;
4234             getProgramBinary(binaryBuf);
4235             if (!binaryBuf.empty())
4236             {
4237                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4238                 handle = NULL;
4239                 createFromBinary(ctx, binaryBuf, errmsg);
4240             }
4241         }
4242 #endif
4243         return handle != NULL;
4244     }
4245
4246     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4247     {
4248         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4249
4250         size_t retsz = 0;
4251         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4252                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4253         if (log_retval == CL_SUCCESS && retsz > 1)
4254         {
4255             buffer.resize(retsz + 16);
4256             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4257                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4258             if (log_retval == CL_SUCCESS)
4259             {
4260                 if (retsz < buffer.size())
4261                     buffer[retsz] = 0;
4262                 else
4263                     buffer[buffer.size() - 1] = 0;
4264             }
4265             else
4266             {
4267                 buffer[0] = 0;
4268             }
4269         }
4270
4271         errmsg = String(buffer.data());
4272         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4273                 sourceModule_.c_str(), sourceName_.c_str(),
4274                 result, getOpenCLErrorString(result),
4275                 buildflags.c_str(), errmsg.c_str());
4276         fflush(stdout);
4277     }
4278
4279     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4280     {
4281         CV_Assert(src_);
4282         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4283         CV_Assert(handle == NULL);
4284         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4285                 sourceModule_.c_str(), sourceName_.c_str(),
4286                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4287
4288         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4289
4290         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4291         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4292         CV_Assert(srcptr != NULL);
4293         CV_Assert(srclen > 0);
4294
4295         cl_int retval = 0;
4296
4297         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4298         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4299         CV_Assert(handle || retval != CL_SUCCESS);
4300         if (handle && retval == CL_SUCCESS)
4301         {
4302             size_t n = ctx.ndevices();
4303             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4304             cl_device_id* deviceList = deviceListBuf.data();
4305             for (size_t i = 0; i < n; i++)
4306             {
4307                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4308             }
4309
4310             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4311             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4312 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4313             if (retval != CL_SUCCESS)
4314 #endif
4315             {
4316                 dumpBuildLog_(retval, deviceList, errmsg);
4317
4318                 // don't remove "retval != CL_SUCCESS" condition here:
4319                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4320                 if (retval != CL_SUCCESS && handle)
4321                 {
4322                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4323                     handle = NULL;
4324                 }
4325             }
4326 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4327             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4328             {
4329                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4330                 size_t retsz = 0;
4331                 char kernels_buffer[4096] = {0};
4332                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4333                 if (retsz < sizeof(kernels_buffer))
4334                     kernels_buffer[retsz] = 0;
4335                 else
4336                     kernels_buffer[0] = 0;
4337                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4338             }
4339 #endif
4340
4341         }
4342         return handle != NULL;
4343     }
4344
4345     void getProgramBinary(std::vector<char>& buf)
4346     {
4347         CV_Assert(handle);
4348         size_t sz = 0;
4349         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4350         buf.resize(sz);
4351         uchar* ptr = (uchar*)&buf[0];
4352         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4353     }
4354
4355     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4356     {
4357         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4358     }
4359
4360     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4361     {
4362         CV_Assert(handle == NULL);
4363         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4364         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4365
4366         CV_Assert(binarySize > 0);
4367
4368         size_t ndevices = (int)ctx.ndevices();
4369         AutoBuffer<cl_device_id> devices_(ndevices);
4370         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4371         AutoBuffer<size_t> binarySizes_(ndevices);
4372
4373         cl_device_id* devices = devices_.data();
4374         const uchar** binaryPtrs = binaryPtrs_.data();
4375         size_t* binarySizes = binarySizes_.data();
4376         for (size_t i = 0; i < ndevices; i++)
4377         {
4378             devices[i] = (cl_device_id)ctx.device(i).ptr();
4379             binaryPtrs[i] = binaryAddr;
4380             binarySizes[i] = binarySize;
4381         }
4382
4383         cl_int result = 0;
4384         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4385                                            binarySizes, binaryPtrs, NULL, &result);
4386         if (result != CL_SUCCESS)
4387         {
4388             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4389             if (handle)
4390             {
4391                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4392                 handle = NULL;
4393             }
4394         }
4395         if (!handle)
4396         {
4397             return false;
4398         }
4399         // call clBuildProgram()
4400         {
4401             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4402             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4403             if (result != CL_SUCCESS)
4404             {
4405                 dumpBuildLog_(result, devices, errmsg);
4406                 if (handle)
4407                 {
4408                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4409                     handle = NULL;
4410                 }
4411                 return false;
4412             }
4413         }
4414         // check build status
4415         {
4416             cl_build_status build_status = CL_BUILD_NONE;
4417             size_t retsz = 0;
4418             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4419                     sizeof(build_status), &build_status, &retsz));
4420             if (result == CL_SUCCESS)
4421             {
4422                 if (build_status == CL_BUILD_SUCCESS)
4423                 {
4424                     return true;
4425                 }
4426                 else
4427                 {
4428                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4429                     return false;
4430                 }
4431             }
4432             else
4433             {
4434                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4435                 if (handle)
4436                 {
4437                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4438                     handle = NULL;
4439                 }
4440             }
4441         }
4442 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4443         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4444         {
4445             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4446             size_t retsz = 0;
4447             char kernels_buffer[4096] = {0};
4448             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4449             if (retsz < sizeof(kernels_buffer))
4450                 kernels_buffer[retsz] = 0;
4451             else
4452                 kernels_buffer[0] = 0;
4453             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4454         }
4455 #endif
4456         return handle != NULL;
4457     }
4458
4459     ~Impl()
4460     {
4461         if( handle )
4462         {
4463 #ifdef _WIN32
4464             if (!cv::__termination)
4465 #endif
4466             {
4467                 clReleaseProgram(handle);
4468             }
4469             handle = NULL;
4470         }
4471     }
4472
4473     cl_program handle;
4474
4475     String buildflags;
4476     String sourceModule_;
4477     String sourceName_;
4478 };
4479
4480
4481 Program::Program() { p = 0; }
4482
4483 Program::Program(const ProgramSource& src,
4484         const String& buildflags, String& errmsg)
4485 {
4486     p = 0;
4487     create(src, buildflags, errmsg);
4488 }
4489
4490 Program::Program(const Program& prog)
4491 {
4492     p = prog.p;
4493     if(p)
4494         p->addref();
4495 }
4496
4497 Program& Program::operator = (const Program& prog)
4498 {
4499     Impl* newp = (Impl*)prog.p;
4500     if(newp)
4501         newp->addref();
4502     if(p)
4503         p->release();
4504     p = newp;
4505     return *this;
4506 }
4507
4508 Program::~Program()
4509 {
4510     if(p)
4511         p->release();
4512 }
4513
4514 bool Program::create(const ProgramSource& src,
4515             const String& buildflags, String& errmsg)
4516 {
4517     if(p)
4518     {
4519         p->release();
4520         p = NULL;
4521     }
4522     p = new Impl(src, buildflags, errmsg);
4523     if(!p->handle)
4524     {
4525         p->release();
4526         p = 0;
4527     }
4528     return p != 0;
4529 }
4530
4531 void* Program::ptr() const
4532 {
4533     return p ? p->handle : 0;
4534 }
4535
4536 #ifndef OPENCV_REMOVE_DEPRECATED_API
4537 const ProgramSource& Program::source() const
4538 {
4539     CV_Error(Error::StsNotImplemented, "Removed API");
4540 }
4541
4542 bool Program::read(const String& bin, const String& buildflags)
4543 {
4544     CV_UNUSED(bin); CV_UNUSED(buildflags);
4545     CV_Error(Error::StsNotImplemented, "Removed API");
4546 }
4547
4548 bool Program::write(String& bin) const
4549 {
4550     CV_UNUSED(bin);
4551     CV_Error(Error::StsNotImplemented, "Removed API");
4552 }
4553
4554 String Program::getPrefix() const
4555 {
4556     if(!p)
4557         return String();
4558     Context::Impl* ctx_ = Context::getDefault().getImpl();
4559     CV_Assert(ctx_);
4560     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4561 }
4562
4563 String Program::getPrefix(const String& buildflags)
4564 {
4565         Context::Impl* ctx_ = Context::getDefault().getImpl();
4566         CV_Assert(ctx_);
4567         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4568 }
4569 #endif // OPENCV_REMOVE_DEPRECATED_API
4570
4571 void Program::getBinary(std::vector<char>& binary) const
4572 {
4573     CV_Assert(p && "Empty program");
4574     p->getProgramBinary(binary);
4575 }
4576
4577 Program Context::Impl::getProg(const ProgramSource& src,
4578                                const String& buildflags, String& errmsg)
4579 {
4580     size_t limit = getProgramCountLimit();
4581     const ProgramSource::Impl* src_ = src.getImpl();
4582     CV_Assert(src_);
4583     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4584             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4585             getPrefixString().c_str(),
4586             buildflags.c_str());
4587     {
4588         cv::AutoLock lock(program_cache_mutex);
4589         phash_t::iterator it = phash.find(key);
4590         if (it != phash.end())
4591         {
4592             // TODO LRU cache
4593             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4594             if (i != cacheList.end() && i != cacheList.begin())
4595             {
4596                 cacheList.erase(i);
4597                 cacheList.push_front(key);
4598             }
4599             return it->second;
4600         }
4601         { // cleanup program cache
4602             size_t sz = phash.size();
4603             if (limit > 0 && sz >= limit)
4604             {
4605                 static bool warningFlag = false;
4606                 if (!warningFlag)
4607                 {
4608                     printf("\nWARNING: OpenCV-OpenCL:\n"
4609                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4610                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4611                     warningFlag = true;
4612                 }
4613                 while (!cacheList.empty())
4614                 {
4615                     size_t c = phash.erase(cacheList.back());
4616                     cacheList.pop_back();
4617                     if (c != 0)
4618                         break;
4619                 }
4620             }
4621         }
4622     }
4623     Program prog(src, buildflags, errmsg);
4624     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4625     {
4626         cv::AutoLock lock(program_cache_mutex);
4627         phash.insert(std::pair<std::string, Program>(key, prog));
4628         cacheList.push_front(key);
4629     }
4630     return prog;
4631 }
4632
4633
4634 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4635
4636 template<typename T>
4637 class OpenCLBufferPool
4638 {
4639 protected:
4640     ~OpenCLBufferPool() { }
4641 public:
4642     virtual T allocate(size_t size) = 0;
4643     virtual void release(T buffer) = 0;
4644 };
4645
4646 template <typename Derived, typename BufferEntry, typename T>
4647 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4648 {
4649 private:
4650     inline Derived& derived() { return *static_cast<Derived*>(this); }
4651 protected:
4652     Mutex mutex_;
4653
4654     size_t currentReservedSize;
4655     size_t maxReservedSize;
4656
4657     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4658     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4659
4660     // synchronized
4661     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4662     {
4663         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4664         for (; i != allocatedEntries_.end(); ++i)
4665         {
4666             BufferEntry& e = *i;
4667             if (e.clBuffer_ == buffer)
4668             {
4669                 entry = e;
4670                 allocatedEntries_.erase(i);
4671                 return true;
4672             }
4673         }
4674         return false;
4675     }
4676
4677     // synchronized
4678     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4679     {
4680         if (reservedEntries_.empty())
4681             return false;
4682         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4683         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4684         BufferEntry result;
4685         size_t minDiff = (size_t)(-1);
4686         for (; i != reservedEntries_.end(); ++i)
4687         {
4688             BufferEntry& e = *i;
4689             if (e.capacity_ >= size)
4690             {
4691                 size_t diff = e.capacity_ - size;
4692                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4693                 {
4694                     minDiff = diff;
4695                     result_pos = i;
4696                     result = e;
4697                     if (diff == 0)
4698                         break;
4699                 }
4700             }
4701         }
4702         if (result_pos != reservedEntries_.end())
4703         {
4704             //CV_DbgAssert(result == *result_pos);
4705             reservedEntries_.erase(result_pos);
4706             entry = result;
4707             currentReservedSize -= entry.capacity_;
4708             allocatedEntries_.push_back(entry);
4709             return true;
4710         }
4711         return false;
4712     }
4713
4714     // synchronized
4715     void _checkSizeOfReservedEntries()
4716     {
4717         while (currentReservedSize > maxReservedSize)
4718         {
4719             CV_DbgAssert(!reservedEntries_.empty());
4720             const BufferEntry& entry = reservedEntries_.back();
4721             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4722             currentReservedSize -= entry.capacity_;
4723             derived()._releaseBufferEntry(entry);
4724             reservedEntries_.pop_back();
4725         }
4726     }
4727
4728     inline size_t _allocationGranularity(size_t size)
4729     {
4730         // heuristic values
4731         if (size < 1024*1024)
4732             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4733         else if (size < 16*1024*1024)
4734             return 64*1024;
4735         else
4736             return 1024*1024;
4737     }
4738
4739 public:
4740     OpenCLBufferPoolBaseImpl()
4741         : currentReservedSize(0),
4742           maxReservedSize(0)
4743     {
4744         // nothing
4745     }
4746     virtual ~OpenCLBufferPoolBaseImpl()
4747     {
4748         freeAllReservedBuffers();
4749         CV_Assert(reservedEntries_.empty());
4750     }
4751 public:
4752     virtual T allocate(size_t size) CV_OVERRIDE
4753     {
4754         AutoLock locker(mutex_);
4755         BufferEntry entry;
4756         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4757         {
4758             CV_DbgAssert(size <= entry.capacity_);
4759             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4760         }
4761         else
4762         {
4763             derived()._allocateBufferEntry(entry, size);
4764         }
4765         return entry.clBuffer_;
4766     }
4767     virtual void release(T buffer) CV_OVERRIDE
4768     {
4769         AutoLock locker(mutex_);
4770         BufferEntry entry;
4771         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4772         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4773         {
4774             derived()._releaseBufferEntry(entry);
4775         }
4776         else
4777         {
4778             reservedEntries_.push_front(entry);
4779             currentReservedSize += entry.capacity_;
4780             _checkSizeOfReservedEntries();
4781         }
4782     }
4783
4784     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4785     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4786     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4787     {
4788         AutoLock locker(mutex_);
4789         size_t oldMaxReservedSize = maxReservedSize;
4790         maxReservedSize = size;
4791         if (maxReservedSize < oldMaxReservedSize)
4792         {
4793             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4794             for (; i != reservedEntries_.end();)
4795             {
4796                 const BufferEntry& entry = *i;
4797                 if (entry.capacity_ > maxReservedSize / 8)
4798                 {
4799                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4800                     currentReservedSize -= entry.capacity_;
4801                     derived()._releaseBufferEntry(entry);
4802                     i = reservedEntries_.erase(i);
4803                     continue;
4804                 }
4805                 ++i;
4806             }
4807             _checkSizeOfReservedEntries();
4808         }
4809     }
4810     virtual void freeAllReservedBuffers() CV_OVERRIDE
4811     {
4812         AutoLock locker(mutex_);
4813         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4814         for (; i != reservedEntries_.end(); ++i)
4815         {
4816             const BufferEntry& entry = *i;
4817             derived()._releaseBufferEntry(entry);
4818         }
4819         reservedEntries_.clear();
4820         currentReservedSize = 0;
4821     }
4822 };
4823
4824 struct CLBufferEntry
4825 {
4826     cl_mem clBuffer_;
4827     size_t capacity_;
4828     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4829 };
4830
4831 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4832 {
4833 public:
4834     typedef struct CLBufferEntry BufferEntry;
4835 protected:
4836     int createFlags_;
4837 public:
4838     OpenCLBufferPoolImpl(int createFlags = 0)
4839         : createFlags_(createFlags)
4840     {
4841     }
4842
4843     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4844     {
4845         CV_DbgAssert(entry.clBuffer_ == NULL);
4846         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4847         Context& ctx = Context::getDefault();
4848         cl_int retval = CL_SUCCESS;
4849         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4850         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4851         CV_Assert(entry.clBuffer_ != NULL);
4852         if(retval == CL_SUCCESS)
4853         {
4854             CV_IMPL_ADD(CV_IMPL_OCL);
4855         }
4856         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4857                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4858         allocatedEntries_.push_back(entry);
4859     }
4860
4861     void _releaseBufferEntry(const BufferEntry& entry)
4862     {
4863         CV_Assert(entry.capacity_ != 0);
4864         CV_Assert(entry.clBuffer_ != NULL);
4865         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4866                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4867         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4868     }
4869 };
4870
4871 #ifdef HAVE_OPENCL_SVM
4872 struct CLSVMBufferEntry
4873 {
4874     void* clBuffer_;
4875     size_t capacity_;
4876     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4877 };
4878 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4879 {
4880 public:
4881     typedef struct CLSVMBufferEntry BufferEntry;
4882 public:
4883     OpenCLSVMBufferPoolImpl()
4884     {
4885     }
4886
4887     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4888     {
4889         CV_DbgAssert(entry.clBuffer_ == NULL);
4890         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4891
4892         Context& ctx = Context::getDefault();
4893         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4894         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4895         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4896                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4897
4898         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4899         CV_DbgAssert(svmFns->isValid());
4900
4901         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4902         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4903         CV_Assert(buf);
4904
4905         entry.clBuffer_ = buf;
4906         {
4907             CV_IMPL_ADD(CV_IMPL_OCL);
4908         }
4909         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4910                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4911         allocatedEntries_.push_back(entry);
4912     }
4913
4914     void _releaseBufferEntry(const BufferEntry& entry)
4915     {
4916         CV_Assert(entry.capacity_ != 0);
4917         CV_Assert(entry.clBuffer_ != NULL);
4918         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4919                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4920         Context& ctx = Context::getDefault();
4921         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4922         CV_DbgAssert(svmFns->isValid());
4923         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4924         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4925     }
4926 };
4927 #endif
4928
4929
4930
4931 template <bool readAccess, bool writeAccess>
4932 class AlignedDataPtr
4933 {
4934 protected:
4935     const size_t size_;
4936     uchar* const originPtr_;
4937     const size_t alignment_;
4938     uchar* ptr_;
4939     uchar* allocatedPtr_;
4940
4941 public:
4942     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4943         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4944     {
4945         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4946         CV_DbgAssert(!readAccess || ptr);
4947         if (((size_t)ptr_ & (alignment - 1)) != 0)
4948         {
4949             allocatedPtr_ = new uchar[size_ + alignment - 1];
4950             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4951             if (readAccess)
4952             {
4953                 memcpy(ptr_, originPtr_, size_);
4954             }
4955         }
4956     }
4957
4958     uchar* getAlignedPtr() const
4959     {
4960         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4961         return ptr_;
4962     }
4963
4964     ~AlignedDataPtr()
4965     {
4966         if (allocatedPtr_)
4967         {
4968             if (writeAccess)
4969             {
4970                 memcpy(originPtr_, ptr_, size_);
4971             }
4972             delete[] allocatedPtr_;
4973             allocatedPtr_ = NULL;
4974         }
4975         ptr_ = NULL;
4976     }
4977 private:
4978     AlignedDataPtr(const AlignedDataPtr&); // disabled
4979     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4980 };
4981
4982 template <bool readAccess, bool writeAccess>
4983 class AlignedDataPtr2D
4984 {
4985 protected:
4986     const size_t size_;
4987     uchar* const originPtr_;
4988     const size_t alignment_;
4989     uchar* ptr_;
4990     uchar* allocatedPtr_;
4991     size_t rows_;
4992     size_t cols_;
4993     size_t step_;
4994
4995 public:
4996     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4997         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4998     {
4999         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5000         CV_DbgAssert(!readAccess || ptr != NULL);
5001         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5002         {
5003             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5004             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5005             if (readAccess)
5006             {
5007                 for (size_t i = 0; i < rows_; i++)
5008                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5009             }
5010         }
5011     }
5012
5013     uchar* getAlignedPtr() const
5014     {
5015         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5016         return ptr_;
5017     }
5018
5019     ~AlignedDataPtr2D()
5020     {
5021         if (allocatedPtr_)
5022         {
5023             if (writeAccess)
5024             {
5025                 for (size_t i = 0; i < rows_; i++)
5026                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5027             }
5028             delete[] allocatedPtr_;
5029             allocatedPtr_ = NULL;
5030         }
5031         ptr_ = NULL;
5032     }
5033 private:
5034     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5035     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5036 };
5037
5038 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5039 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5040 #endif
5041
5042
5043 void Context::Impl::__init_buffer_pools()
5044 {
5045     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5046     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5047     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5048     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5049
5050     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5051     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5052     bufferPool.setMaxReservedSize(poolSize);
5053     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5054     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5055
5056 #ifdef HAVE_OPENCL_SVM
5057     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5058     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5059     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5060     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5061 #endif
5062
5063     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5064 }
5065
5066 class OpenCLAllocator CV_FINAL : public MatAllocator
5067 {
5068 public:
5069     enum AllocatorFlags
5070     {
5071         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5072         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5073 #ifdef HAVE_OPENCL_SVM
5074         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5075 #endif
5076         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5077     };
5078
5079     OpenCLAllocator()
5080     {
5081         matStdAllocator = Mat::getDefaultAllocator();
5082     }
5083     ~OpenCLAllocator()
5084     {
5085         flushCleanupQueue();
5086     }
5087
5088     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5089             AccessFlag flags, UMatUsageFlags usageFlags) const
5090     {
5091         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5092         return u;
5093     }
5094
5095     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5096     {
5097         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5098         return value;
5099     }
5100     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5101     {
5102         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5103         return value;
5104     }
5105
5106     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5107     {
5108         const Device& dev = ctx.device(0);
5109         createFlags = 0;
5110         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5111             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5112
5113         if (!isOpenCLCopyingForced() &&
5114             (isOpenCLMapForced() ||
5115                 (dev.hostUnifiedMemory()
5116 #ifndef __APPLE__
5117                 || dev.isIntel()
5118 #endif
5119                 )
5120             )
5121         )
5122             flags0 = static_cast<UMatData::MemoryFlag>(0);
5123         else
5124             flags0 = UMatData::COPY_ON_MAP;
5125     }
5126
5127     UMatData* allocate(int dims, const int* sizes, int type,
5128                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5129     {
5130         if(!useOpenCL())
5131             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5132
5133         flushCleanupQueue();
5134
5135         CV_Assert(data == 0);
5136         size_t total = CV_ELEM_SIZE(type);
5137         for( int i = dims-1; i >= 0; i-- )
5138         {
5139             if( step )
5140                 step[i] = total;
5141             total *= sizes[i];
5142         }
5143
5144         Context& ctx = Context::getDefault();
5145         if (!ctx.getImpl())
5146             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5147         Context::Impl& ctxImpl = *ctx.getImpl();
5148
5149         int createFlags = 0;
5150         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5151         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5152
5153         void* handle = NULL;
5154         int allocatorFlags = 0;
5155
5156 #ifdef HAVE_OPENCL_SVM
5157         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5158         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5159         {
5160             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5161             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5162
5163             // this property is constant, so single buffer pool can be used here
5164             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5165             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5166         }
5167         else
5168 #endif
5169         if (createFlags == 0)
5170         {
5171             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5172             handle = ctxImpl.getBufferPool().allocate(total);
5173         }
5174         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5175         {
5176             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5177             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5178         }
5179         else
5180         {
5181             CV_Assert(handle != NULL); // Unsupported, throw
5182         }
5183
5184         if (!handle)
5185             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5186
5187         UMatData* u = new UMatData(this);
5188         u->data = 0;
5189         u->size = total;
5190         u->handle = handle;
5191         u->flags = flags0;
5192         u->allocatorFlags_ = allocatorFlags;
5193         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5194         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5195         u->markHostCopyObsolete(true);
5196         opencl_allocator_stats.onAllocate(u->size);
5197         return u;
5198     }
5199
5200     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5201     {
5202         if(!u)
5203             return false;
5204
5205         flushCleanupQueue();
5206
5207         UMatDataAutoLock lock(u);
5208
5209         if(u->handle == 0)
5210         {
5211             CV_Assert(u->origdata != 0);
5212             Context& ctx = Context::getDefault();
5213             int createFlags = 0;
5214             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5215             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5216
5217             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5218
5219             cl_context ctx_handle = (cl_context)ctx.ptr();
5220             int allocatorFlags = 0;
5221             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5222             void* handle = NULL;
5223             cl_int retval = CL_SUCCESS;
5224
5225 #ifdef HAVE_OPENCL_SVM
5226             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5227             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5228             if (useSVM && svmCaps.isSupportFineGrainSystem())
5229             {
5230                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5231                 tempUMatFlags = UMatData::TEMP_UMAT;
5232                 handle = u->origdata;
5233                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5234             }
5235             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5236             {
5237                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5238                 {
5239                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5240
5241                     cl_svm_mem_flags memFlags = createFlags |
5242                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5243
5244                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5245                     CV_DbgAssert(svmFns->isValid());
5246
5247                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5248                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5249                     CV_Assert(handle);
5250
5251                     cl_command_queue q = NULL;
5252                     if (!isFineGrainBuffer)
5253                     {
5254                         q = (cl_command_queue)Queue::getDefault().ptr();
5255                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5256                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5257                                 handle, u->size,
5258                                 0, NULL, NULL);
5259                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5260
5261                     }
5262                     memcpy(handle, u->origdata, u->size);
5263                     if (!isFineGrainBuffer)
5264                     {
5265                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5266                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5267                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5268                     }
5269
5270                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5271                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5272                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5273                 }
5274             }
5275             else
5276 #endif
5277             {
5278                 if( copyOnMap )
5279                     accessFlags &= ~ACCESS_FAST;
5280
5281                 tempUMatFlags = UMatData::TEMP_UMAT;
5282                 if (
5283                 #ifdef __APPLE__
5284                     !copyOnMap &&
5285                 #endif
5286                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5287                     // There are OpenCL runtime issues for less aligned data
5288                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5289                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5290                     // Avoid sharing of host memory between OpenCL buffers
5291                     && !(u->originalUMatData && u->originalUMatData->handle)
5292                 )
5293                 {
5294                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5295                                             u->size, u->origdata, &retval);
5296                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5297                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5298                 }
5299                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5300                 {
5301                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5302                                                u->size, u->origdata, &retval);
5303                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5304                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5305                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5306                 }
5307             }
5308             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5309             if(!handle || retval != CL_SUCCESS)
5310                 return false;
5311             u->handle = handle;
5312             u->prevAllocator = u->currAllocator;
5313             u->currAllocator = this;
5314             u->flags |= tempUMatFlags | flags0;
5315             u->allocatorFlags_ = allocatorFlags;
5316         }
5317         if (!!(accessFlags & ACCESS_WRITE))
5318             u->markHostCopyObsolete(true);
5319         opencl_allocator_stats.onAllocate(u->size);
5320         return true;
5321     }
5322
5323     /*void sync(UMatData* u) const
5324     {
5325         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5326         UMatDataAutoLock lock(u);
5327
5328         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5329         {
5330             if( u->tempCopiedUMat() )
5331             {
5332                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5333                                     u->size, u->origdata, 0, 0, 0);
5334             }
5335             else
5336             {
5337                 cl_int retval = 0;
5338                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5339                                                 (CL_MAP_READ | CL_MAP_WRITE),
5340                                                 0, u->size, 0, 0, 0, &retval);
5341                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5342                 clFinish(q);
5343             }
5344             u->markHostCopyObsolete(false);
5345         }
5346         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5347         {
5348             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5349                                  u->size, u->data, 0, 0, 0);
5350         }
5351     }*/
5352
5353     void deallocate(UMatData* u) const CV_OVERRIDE
5354     {
5355         if(!u)
5356             return;
5357
5358         CV_Assert(u->urefcount == 0);
5359         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5360
5361         CV_Assert(u->handle != 0);
5362         CV_Assert(u->mapcount == 0);
5363
5364         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5365             addToCleanupQueue(u);
5366         else
5367             deallocate_(u);
5368     }
5369
5370     void deallocate_(UMatData* u) const
5371     {
5372         CV_Assert(u);
5373         CV_Assert(u->handle);
5374         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5375         {
5376             opencl_allocator_stats.onFree(u->size);
5377         }
5378
5379 #ifdef _WIN32
5380         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5381             return;             // avoid any OpenCL calls
5382 #endif
5383         if(u->tempUMat())
5384         {
5385             CV_Assert(u->origdata);
5386 //            UMatDataAutoLock lock(u);
5387
5388             if (u->hostCopyObsolete())
5389             {
5390 #ifdef HAVE_OPENCL_SVM
5391                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5392                 {
5393                     Context& ctx = Context::getDefault();
5394                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5395                     CV_DbgAssert(svmFns->isValid());
5396
5397                     if( u->tempCopiedUMat() )
5398                     {
5399                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5400                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5401                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5402                         cl_command_queue q = NULL;
5403                         if (!isFineGrainBuffer)
5404                         {
5405                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5406                             q = (cl_command_queue)Queue::getDefault().ptr();
5407                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5408                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5409                                     u->handle, u->size,
5410                                     0, NULL, NULL);
5411                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5412                         }
5413                         clFinish(q);
5414                         memcpy(u->origdata, u->handle, u->size);
5415                         if (!isFineGrainBuffer)
5416                         {
5417                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5418                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5419                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5420                         }
5421                     }
5422                     else
5423                     {
5424                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5425                         // nothing
5426                     }
5427                 }
5428                 else
5429 #endif
5430                 {
5431                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5432                     if( u->tempCopiedUMat() )
5433                     {
5434                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5435                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5436                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5437                     }
5438                     else
5439                     {
5440                         cl_int retval = 0;
5441                         if (u->tempUMat())
5442                         {
5443                             CV_Assert(u->mapcount == 0);
5444                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5445                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5446                                 (CL_MAP_READ | CL_MAP_WRITE),
5447                                 0, u->size, 0, 0, 0, &retval);
5448                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5449                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5450                             if (u->originalUMatData)
5451                             {
5452                                 CV_Assert(u->originalUMatData->data == data);
5453                             }
5454                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5455                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5456                             CV_OCL_DBG_CHECK(clFinish(q));
5457                         }
5458                     }
5459                 }
5460                 u->markHostCopyObsolete(false);
5461             }
5462             else
5463             {
5464                 // nothing
5465             }
5466 #ifdef HAVE_OPENCL_SVM
5467             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5468             {
5469                 if( u->tempCopiedUMat() )
5470                 {
5471                     Context& ctx = Context::getDefault();
5472                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5473                     CV_DbgAssert(svmFns->isValid());
5474
5475                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5476                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5477                 }
5478             }
5479             else
5480 #endif
5481             {
5482                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5483                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5484             }
5485             u->handle = 0;
5486             u->markDeviceCopyObsolete(true);
5487             u->currAllocator = u->prevAllocator;
5488             u->prevAllocator = NULL;
5489             if(u->data && u->copyOnMap() && u->data != u->origdata)
5490                 fastFree(u->data);
5491             u->data = u->origdata;
5492             u->currAllocator->deallocate(u);
5493             u = NULL;
5494         }
5495         else
5496         {
5497             CV_Assert(u->origdata == NULL);
5498             if(u->data && u->copyOnMap() && u->data != u->origdata)
5499             {
5500                 fastFree(u->data);
5501                 u->data = 0;
5502                 u->markHostCopyObsolete(true);
5503             }
5504             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5505             {
5506                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5507                 CV_Assert(pCtx);
5508                 ocl::Context& ctx = *pCtx.get();
5509                 CV_Assert(ctx.getImpl());
5510                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5511             }
5512             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5513             {
5514                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5515                 CV_Assert(pCtx);
5516                 ocl::Context& ctx = *pCtx.get();
5517                 CV_Assert(ctx.getImpl());
5518                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5519             }
5520 #ifdef HAVE_OPENCL_SVM
5521             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5522             {
5523                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5524                 CV_Assert(pCtx);
5525                 ocl::Context& ctx = *pCtx.get();
5526                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5527                 {
5528                     //nothing
5529                 }
5530                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5531                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5532                 {
5533                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5534                     CV_DbgAssert(svmFns->isValid());
5535                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5536
5537                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5538                     {
5539                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5540                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5541                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5542                     }
5543                 }
5544                 CV_Assert(ctx.getImpl());
5545                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5546             }
5547 #endif
5548             else
5549             {
5550                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5551             }
5552             u->handle = 0;
5553             u->markDeviceCopyObsolete(true);
5554             delete u;
5555             u = NULL;
5556         }
5557         CV_Assert(u == NULL);
5558     }
5559
5560     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5561     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5562     {
5563         CV_Assert(u && u->handle);
5564
5565         if (!!(accessFlags & ACCESS_WRITE))
5566             u->markDeviceCopyObsolete(true);
5567
5568         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5569
5570         {
5571             if( !u->copyOnMap() )
5572             {
5573                 // TODO
5574                 // because there can be other map requests for the same UMat with different access flags,
5575                 // we use the universal (read-write) access mode.
5576 #ifdef HAVE_OPENCL_SVM
5577                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5578                 {
5579                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5580                     {
5581                         Context& ctx = Context::getDefault();
5582                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5583                         CV_DbgAssert(svmFns->isValid());
5584
5585                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5586                         {
5587                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5588                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5589                                     u->handle, u->size,
5590                                     0, NULL, NULL);
5591                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5592                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5593                         }
5594                     }
5595                     clFinish(q);
5596                     u->data = (uchar*)u->handle;
5597                     u->markHostCopyObsolete(false);
5598                     u->markDeviceMemMapped(true);
5599                     return;
5600                 }
5601 #endif
5602
5603                 cl_int retval = CL_SUCCESS;
5604                 if (!u->deviceMemMapped())
5605                 {
5606                     CV_Assert(u->refcount == 1);
5607                     CV_Assert(u->mapcount++ == 0);
5608                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5609                                                          (CL_MAP_READ | CL_MAP_WRITE),
5610                                                          0, u->size, 0, 0, 0, &retval);
5611                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5612                 }
5613                 if (u->data && retval == CL_SUCCESS)
5614                 {
5615                     u->markHostCopyObsolete(false);
5616                     u->markDeviceMemMapped(true);
5617                     return;
5618                 }
5619
5620                 // TODO Is it really a good idea and was it tested well?
5621                 // if map failed, switch to copy-on-map mode for the particular buffer
5622                 u->flags |= UMatData::COPY_ON_MAP;
5623             }
5624
5625             if(!u->data)
5626             {
5627                 u->data = (uchar*)fastMalloc(u->size);
5628                 u->markHostCopyObsolete(true);
5629             }
5630         }
5631
5632         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5633         {
5634             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5635 #ifdef HAVE_OPENCL_SVM
5636             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5637 #endif
5638             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5639                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5640             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5641                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5642             u->markHostCopyObsolete(false);
5643         }
5644     }
5645
5646     void unmap(UMatData* u) const CV_OVERRIDE
5647     {
5648         if(!u)
5649             return;
5650
5651
5652         CV_Assert(u->handle != 0);
5653
5654         UMatDataAutoLock autolock(u);
5655
5656         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5657         cl_int retval = 0;
5658         if( !u->copyOnMap() && u->deviceMemMapped() )
5659         {
5660             CV_Assert(u->data != NULL);
5661 #ifdef HAVE_OPENCL_SVM
5662             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5663             {
5664                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5665                 {
5666                     Context& ctx = Context::getDefault();
5667                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5668                     CV_DbgAssert(svmFns->isValid());
5669
5670                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5671                     {
5672                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5673                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5674                                 0, NULL, NULL);
5675                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5676                         clFinish(q);
5677                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5678                     }
5679                 }
5680                 if (u->refcount == 0)
5681                     u->data = 0;
5682                 u->markDeviceCopyObsolete(false);
5683                 u->markHostCopyObsolete(true);
5684                 return;
5685             }
5686 #endif
5687             if (u->refcount == 0)
5688             {
5689                 CV_Assert(u->mapcount-- == 1);
5690                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5691                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5692                 if (Device::getDefault().isAMD())
5693                 {
5694                     // required for multithreaded applications (see stitching test)
5695                     CV_OCL_DBG_CHECK(clFinish(q));
5696                 }
5697                 u->markDeviceMemMapped(false);
5698                 u->data = 0;
5699                 u->markDeviceCopyObsolete(false);
5700                 u->markHostCopyObsolete(true);
5701             }
5702         }
5703         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5704         {
5705             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5706 #ifdef HAVE_OPENCL_SVM
5707             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5708 #endif
5709             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5710                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5711             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5712                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5713             u->markDeviceCopyObsolete(false);
5714             u->markHostCopyObsolete(true);
5715         }
5716     }
5717
5718     bool checkContinuous(int dims, const size_t sz[],
5719                          const size_t srcofs[], const size_t srcstep[],
5720                          const size_t dstofs[], const size_t dststep[],
5721                          size_t& total, size_t new_sz[],
5722                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5723                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5724     {
5725         bool iscontinuous = true;
5726         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5727         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5728         total = sz[dims-1];
5729         for( int i = dims-2; i >= 0; i-- )
5730         {
5731             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5732                 iscontinuous = false;
5733             total *= sz[i];
5734             if( srcofs )
5735                 srcrawofs += srcofs[i]*srcstep[i];
5736             if( dstofs )
5737                 dstrawofs += dstofs[i]*dststep[i];
5738         }
5739
5740         if( !iscontinuous )
5741         {
5742             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5743             if( dims == 2 )
5744             {
5745                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5746                 // we assume that new_... arrays are initialized by caller
5747                 // with 0's, so there is no else branch
5748                 if( srcofs )
5749                 {
5750                     new_srcofs[0] = srcofs[1];
5751                     new_srcofs[1] = srcofs[0];
5752                     new_srcofs[2] = 0;
5753                 }
5754
5755                 if( dstofs )
5756                 {
5757                     new_dstofs[0] = dstofs[1];
5758                     new_dstofs[1] = dstofs[0];
5759                     new_dstofs[2] = 0;
5760                 }
5761
5762                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5763                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5764             }
5765             else
5766             {
5767                 // we could check for dims == 3 here,
5768                 // but from user perspective this one is more informative
5769                 CV_Assert(dims <= 3);
5770                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5771                 if( srcofs )
5772                 {
5773                     new_srcofs[0] = srcofs[2];
5774                     new_srcofs[1] = srcofs[1];
5775                     new_srcofs[2] = srcofs[0];
5776                 }
5777
5778                 if( dstofs )
5779                 {
5780                     new_dstofs[0] = dstofs[2];
5781                     new_dstofs[1] = dstofs[1];
5782                     new_dstofs[2] = dstofs[0];
5783                 }
5784
5785                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5786                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5787             }
5788         }
5789         return iscontinuous;
5790     }
5791
5792     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5793                   const size_t srcofs[], const size_t srcstep[],
5794                   const size_t dststep[]) const CV_OVERRIDE
5795     {
5796         if(!u)
5797             return;
5798         UMatDataAutoLock autolock(u);
5799
5800         if( u->data && !u->hostCopyObsolete() )
5801         {
5802             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5803             return;
5804         }
5805         CV_Assert( u->handle != 0 );
5806
5807         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5808
5809         size_t total = 0, new_sz[] = {0, 0, 0};
5810         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5811         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5812
5813         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5814                                             total, new_sz,
5815                                             srcrawofs, new_srcofs, new_srcstep,
5816                                             dstrawofs, new_dstofs, new_dststep);
5817
5818 #ifdef HAVE_OPENCL_SVM
5819         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5820         {
5821             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5822             Context& ctx = Context::getDefault();
5823             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5824             CV_DbgAssert(svmFns->isValid());
5825
5826             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5827             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5828             {
5829                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5830                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5831                         u->handle, u->size,
5832                         0, NULL, NULL);
5833                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5834             }
5835             clFinish(q);
5836             if( iscontinuous )
5837             {
5838                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5839             }
5840             else
5841             {
5842                 // This code is from MatAllocator::download()
5843                 int isz[CV_MAX_DIM];
5844                 uchar* srcptr = (uchar*)u->handle;
5845                 for( int i = 0; i < dims; i++ )
5846                 {
5847                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5848                     if( sz[i] == 0 )
5849                     return;
5850                     if( srcofs )
5851                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5852                     isz[i] = (int)sz[i];
5853                 }
5854
5855                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5856                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5857
5858                 const Mat* arrays[] = { &src, &dst };
5859                 uchar* ptrs[2];
5860                 NAryMatIterator it(arrays, ptrs, 2);
5861                 size_t j, planesz = it.size;
5862
5863                 for( j = 0; j < it.nplanes; j++, ++it )
5864                     memcpy(ptrs[1], ptrs[0], planesz);
5865             }
5866             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5867             {
5868                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5869                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5870                         0, NULL, NULL);
5871                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5872                 clFinish(q);
5873             }
5874         }
5875         else
5876 #endif
5877         {
5878             if( iscontinuous )
5879             {
5880                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5881                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5882                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5883             }
5884             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5885             {
5886                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5887                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5888                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5889                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5890                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5891                 uchar* ptr = alignedPtr.getAlignedPtr();
5892
5893                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5894                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5895                 total = std::min(total, u->size - new_srcrawofs);
5896                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5897                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5898                 for( size_t i = 0; i < new_sz[1]; i++ )
5899                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5900             }
5901             else
5902             {
5903                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5904                 uchar* ptr = alignedPtr.getAlignedPtr();
5905
5906                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5907                     new_srcofs, new_dstofs, new_sz,
5908                     new_srcstep[0], 0,
5909                     new_dststep[0], 0,
5910                     ptr, 0, 0, 0));
5911             }
5912         }
5913     }
5914
5915     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5916                 const size_t dstofs[], const size_t dststep[],
5917                 const size_t srcstep[]) const CV_OVERRIDE
5918     {
5919         if(!u)
5920             return;
5921
5922         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5923         CV_Assert(u->refcount == 0 || u->tempUMat());
5924
5925         size_t total = 0, new_sz[] = {0, 0, 0};
5926         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5927         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5928
5929         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5930                                             total, new_sz,
5931                                             srcrawofs, new_srcofs, new_srcstep,
5932                                             dstrawofs, new_dstofs, new_dststep);
5933
5934         UMatDataAutoLock autolock(u);
5935
5936         // if there is cached CPU copy of the GPU matrix,
5937         // we could use it as a destination.
5938         // we can do it in 2 cases:
5939         //    1. we overwrite the whole content
5940         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5941         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5942         {
5943             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5944             u->markHostCopyObsolete(false);
5945             u->markDeviceCopyObsolete(true);
5946             return;
5947         }
5948
5949         CV_Assert( u->handle != 0 );
5950         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5951
5952 #ifdef HAVE_OPENCL_SVM
5953         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5954         {
5955             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5956             Context& ctx = Context::getDefault();
5957             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5958             CV_DbgAssert(svmFns->isValid());
5959
5960             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5961             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5962             {
5963                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5964                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5965                         u->handle, u->size,
5966                         0, NULL, NULL);
5967                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5968             }
5969             clFinish(q);
5970             if( iscontinuous )
5971             {
5972                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5973             }
5974             else
5975             {
5976                 // This code is from MatAllocator::upload()
5977                 int isz[CV_MAX_DIM];
5978                 uchar* dstptr = (uchar*)u->handle;
5979                 for( int i = 0; i < dims; i++ )
5980                 {
5981                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5982                     if( sz[i] == 0 )
5983                     return;
5984                     if( dstofs )
5985                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5986                     isz[i] = (int)sz[i];
5987                 }
5988
5989                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5990                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5991
5992                 const Mat* arrays[] = { &src, &dst };
5993                 uchar* ptrs[2];
5994                 NAryMatIterator it(arrays, ptrs, 2);
5995                 size_t j, planesz = it.size;
5996
5997                 for( j = 0; j < it.nplanes; j++, ++it )
5998                     memcpy(ptrs[1], ptrs[0], planesz);
5999             }
6000             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6001             {
6002                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6003                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6004                         0, NULL, NULL);
6005                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6006                 clFinish(q);
6007             }
6008         }
6009         else
6010 #endif
6011         {
6012             if( iscontinuous )
6013             {
6014                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6015                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6016                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6017                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6018                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6019             }
6020             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6021             {
6022                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6023                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6024                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6025                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6026                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6027                 uchar* ptr = alignedPtr.getAlignedPtr();
6028
6029                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6030                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6031                 total = std::min(total, u->size - new_dstrawofs);
6032                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6033                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6034                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6035                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6036                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6037                 for( size_t i = 0; i < new_sz[1]; i++ )
6038                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6039                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6040                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6041             }
6042             else
6043             {
6044                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6045                 uchar* ptr = alignedPtr.getAlignedPtr();
6046
6047                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6048                     new_dstofs, new_srcofs, new_sz,
6049                     new_dststep[0], 0,
6050                     new_srcstep[0], 0,
6051                     ptr, 0, 0, 0));
6052             }
6053         }
6054         u->markHostCopyObsolete(true);
6055 #ifdef HAVE_OPENCL_SVM
6056         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6057                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6058         {
6059             // nothing
6060         }
6061         else
6062 #endif
6063         {
6064             u->markHostCopyObsolete(true);
6065         }
6066         u->markDeviceCopyObsolete(false);
6067     }
6068
6069     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6070               const size_t srcofs[], const size_t srcstep[],
6071               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6072     {
6073         if(!src || !dst)
6074             return;
6075
6076         size_t total = 0, new_sz[] = {0, 0, 0};
6077         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6078         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6079
6080         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6081                                             total, new_sz,
6082                                             srcrawofs, new_srcofs, new_srcstep,
6083                                             dstrawofs, new_dstofs, new_dststep);
6084
6085         UMatDataAutoLock src_autolock(src, dst);
6086
6087         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6088         {
6089             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6090             return;
6091         }
6092         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6093         {
6094             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6095             dst->markHostCopyObsolete(false);
6096 #ifdef HAVE_OPENCL_SVM
6097             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6098                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6099             {
6100                 // nothing
6101             }
6102             else
6103 #endif
6104             {
6105                 dst->markDeviceCopyObsolete(true);
6106             }
6107             return;
6108         }
6109
6110         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6111         CV_Assert(dst->refcount == 0);
6112         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6113
6114         cl_int retval = CL_SUCCESS;
6115 #ifdef HAVE_OPENCL_SVM
6116         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6117                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6118         {
6119             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6120                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6121             {
6122                 Context& ctx = Context::getDefault();
6123                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6124                 CV_DbgAssert(svmFns->isValid());
6125
6126                 if( iscontinuous )
6127                 {
6128                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6129                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6130                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6131                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6132                             total, 0, NULL, NULL);
6133                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6134                 }
6135                 else
6136                 {
6137                     clFinish(q);
6138                     // This code is from MatAllocator::download()/upload()
6139                     int isz[CV_MAX_DIM];
6140                     uchar* srcptr = (uchar*)src->handle;
6141                     for( int i = 0; i < dims; i++ )
6142                     {
6143                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6144                         if( sz[i] == 0 )
6145                         return;
6146                         if( srcofs )
6147                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6148                         isz[i] = (int)sz[i];
6149                     }
6150                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6151
6152                     uchar* dstptr = (uchar*)dst->handle;
6153                     for( int i = 0; i < dims; i++ )
6154                     {
6155                         if( dstofs )
6156                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6157                     }
6158                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6159
6160                     const Mat* arrays[] = { &m_src, &m_dst };
6161                     uchar* ptrs[2];
6162                     NAryMatIterator it(arrays, ptrs, 2);
6163                     size_t j, planesz = it.size;
6164
6165                     for( j = 0; j < it.nplanes; j++, ++it )
6166                         memcpy(ptrs[1], ptrs[0], planesz);
6167                 }
6168             }
6169             else
6170             {
6171                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6172                 {
6173                     map(src, ACCESS_READ);
6174                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6175                     unmap(src);
6176                 }
6177                 else
6178                 {
6179                     map(dst, ACCESS_WRITE);
6180                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6181                     unmap(dst);
6182                 }
6183             }
6184         }
6185         else
6186 #endif
6187         {
6188             if( iscontinuous )
6189             {
6190                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6191                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6192                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6193                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6194             }
6195             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6196             {
6197                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6198                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6199                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6200                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6201                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6202
6203                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6204                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6205                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6206                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6207                 uchar* srcptr = srcBuf.getAlignedPtr();
6208                 uchar* dstptr = dstBuf.getAlignedPtr();
6209
6210                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6211
6212                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6213                 src_total = std::min(src_total, src->size - new_srcrawofs);
6214                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6215                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6216
6217                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6218                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6219                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6220                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6221
6222                 for( size_t i = 0; i < new_sz[1]; i++ )
6223                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6224                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6225                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6226                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6227             }
6228             else
6229             {
6230                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6231                                                    new_srcofs, new_dstofs, new_sz,
6232                                                    new_srcstep[0], 0,
6233                                                    new_dststep[0], 0,
6234                                                    0, 0, 0));
6235             }
6236         }
6237         if (retval == CL_SUCCESS)
6238         {
6239             CV_IMPL_ADD(CV_IMPL_OCL)
6240         }
6241
6242 #ifdef HAVE_OPENCL_SVM
6243         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6244             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6245         {
6246             // nothing
6247         }
6248         else
6249 #endif
6250         {
6251             dst->markHostCopyObsolete(true);
6252         }
6253         dst->markDeviceCopyObsolete(false);
6254
6255         if( _sync )
6256         {
6257             CV_OCL_DBG_CHECK(clFinish(q));
6258         }
6259     }
6260
6261     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6262     {
6263         ocl::Context ctx = Context::getDefault();
6264         if (ctx.empty())
6265             return NULL;
6266 #ifdef HAVE_OPENCL_SVM
6267         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6268         {
6269             return &ctx.getImpl()->getBufferPoolSVM();
6270         }
6271 #endif
6272         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6273         {
6274             return &ctx.getImpl()->getBufferPoolHostPtr();
6275         }
6276         if (id != NULL && strcmp(id, "OCL") != 0)
6277         {
6278             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6279         }
6280         return &ctx.getImpl()->getBufferPool();
6281     }
6282
6283     MatAllocator* matStdAllocator;
6284
6285     mutable cv::Mutex cleanupQueueMutex;
6286     mutable std::deque<UMatData*> cleanupQueue;
6287
6288     void flushCleanupQueue() const
6289     {
6290         if (!cleanupQueue.empty())
6291         {
6292             std::deque<UMatData*> q;
6293             {
6294                 cv::AutoLock lock(cleanupQueueMutex);
6295                 q.swap(cleanupQueue);
6296             }
6297             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6298             {
6299                 deallocate_(*i);
6300             }
6301         }
6302     }
6303     void addToCleanupQueue(UMatData* u) const
6304     {
6305         //TODO: Validation check: CV_Assert(!u->tempUMat());
6306         {
6307             cv::AutoLock lock(cleanupQueueMutex);
6308             cleanupQueue.push_back(u);
6309         }
6310     }
6311 };
6312
6313 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6314 {
6315     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6316     g_isOpenCVActivated = true;
6317     return g_allocator;
6318 }
6319 MatAllocator* getOpenCLAllocator()
6320 {
6321     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6322 }
6323
6324 }} // namespace cv::ocl
6325
6326
6327 namespace cv {
6328
6329 // three funcs below are implemented in umatrix.cpp
6330 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6331               bool autoSteps = false );
6332 void finalizeHdr(UMat& m);
6333
6334 } // namespace cv
6335
6336
6337 namespace cv { namespace ocl {
6338
6339 /*
6340 // Convert OpenCL buffer memory to UMat
6341 */
6342 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6343 {
6344     int d = 2;
6345     int sizes[] = { rows, cols };
6346
6347     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6348
6349     dst.release();
6350
6351     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6352     dst.usageFlags = USAGE_DEFAULT;
6353
6354     setSize(dst, d, sizes, 0, true);
6355     dst.offset = 0;
6356
6357     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6358     cl_mem_object_type mem_type = 0;
6359
6360     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6361
6362     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6363
6364     size_t total = 0;
6365     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6366
6367     CV_OCL_CHECK(clRetainMemObject(memobj));
6368
6369     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6370     CV_Assert(total >= rows * step);
6371
6372     // attach clBuffer to UMatData
6373     dst.u = new UMatData(getOpenCLAllocator());
6374     dst.u->data            = 0;
6375     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6376     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6377     dst.u->handle          = cl_mem_buffer;
6378     dst.u->origdata        = 0;
6379     dst.u->prevAllocator   = 0;
6380     dst.u->size            = total;
6381
6382     finalizeHdr(dst);
6383     dst.addref();
6384
6385     return;
6386 } // convertFromBuffer()
6387
6388
6389 /*
6390 // Convert OpenCL image2d_t memory to UMat
6391 */
6392 void convertFromImage(void* cl_mem_image, UMat& dst)
6393 {
6394     cl_mem             clImage = (cl_mem)cl_mem_image;
6395     cl_mem_object_type mem_type = 0;
6396
6397     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6398
6399     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6400
6401     cl_image_format fmt = { 0, 0 };
6402     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6403
6404     int depth = CV_8U;
6405     switch (fmt.image_channel_data_type)
6406     {
6407     case CL_UNORM_INT8:
6408     case CL_UNSIGNED_INT8:
6409         depth = CV_8U;
6410         break;
6411
6412     case CL_SNORM_INT8:
6413     case CL_SIGNED_INT8:
6414         depth = CV_8S;
6415         break;
6416
6417     case CL_UNORM_INT16:
6418     case CL_UNSIGNED_INT16:
6419         depth = CV_16U;
6420         break;
6421
6422     case CL_SNORM_INT16:
6423     case CL_SIGNED_INT16:
6424         depth = CV_16S;
6425         break;
6426
6427     case CL_SIGNED_INT32:
6428         depth = CV_32S;
6429         break;
6430
6431     case CL_FLOAT:
6432         depth = CV_32F;
6433         break;
6434
6435     default:
6436         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6437     }
6438
6439     int type = CV_8UC1;
6440     switch (fmt.image_channel_order)
6441     {
6442     case CL_R:
6443         type = CV_MAKE_TYPE(depth, 1);
6444         break;
6445
6446     case CL_RGBA:
6447     case CL_BGRA:
6448     case CL_ARGB:
6449         type = CV_MAKE_TYPE(depth, 4);
6450         break;
6451
6452     default:
6453         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6454         break;
6455     }
6456
6457     size_t step = 0;
6458     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6459
6460     size_t w = 0;
6461     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6462
6463     size_t h = 0;
6464     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6465
6466     dst.create((int)h, (int)w, type);
6467
6468     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6469
6470     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6471
6472     size_t offset = 0;
6473     size_t src_origin[3] = { 0, 0, 0 };
6474     size_t region[3] = { w, h, 1 };
6475     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6476
6477     CV_OCL_CHECK(clFinish(q));
6478
6479     return;
6480 } // convertFromImage()
6481
6482
6483 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6484
6485 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6486 {
6487     cl_uint numDevices = 0;
6488     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6489     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6490     {
6491         CV_OCL_DBG_CHECK_RESULT(status,
6492             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6493     }
6494
6495     if (numDevices == 0)
6496     {
6497         devices.clear();
6498         return;
6499     }
6500
6501     devices.resize((size_t)numDevices);
6502     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6503 }
6504
6505 struct PlatformInfo::Impl
6506 {
6507     Impl(void* id)
6508     {
6509         refcount = 1;
6510         handle = *(cl_platform_id*)id;
6511         getDevices(devices, handle);
6512     }
6513
6514     String getStrProp(cl_platform_info prop) const
6515     {
6516         char buf[1024];
6517         size_t sz=0;
6518         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6519             sz < sizeof(buf) ? String(buf) : String();
6520     }
6521
6522     IMPLEMENT_REFCOUNTABLE();
6523     std::vector<cl_device_id> devices;
6524     cl_platform_id handle;
6525 };
6526
6527 PlatformInfo::PlatformInfo()
6528 {
6529     p = 0;
6530 }
6531
6532 PlatformInfo::PlatformInfo(void* platform_id)
6533 {
6534     p = new Impl(platform_id);
6535 }
6536
6537 PlatformInfo::~PlatformInfo()
6538 {
6539     if(p)
6540         p->release();
6541 }
6542
6543 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6544 {
6545     if (i.p)
6546         i.p->addref();
6547     p = i.p;
6548 }
6549
6550 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6551 {
6552     if (i.p != p)
6553     {
6554         if (i.p)
6555             i.p->addref();
6556         if (p)
6557             p->release();
6558         p = i.p;
6559     }
6560     return *this;
6561 }
6562
6563 int PlatformInfo::deviceNumber() const
6564 {
6565     return p ? (int)p->devices.size() : 0;
6566 }
6567
6568 void PlatformInfo::getDevice(Device& device, int d) const
6569 {
6570     CV_Assert(p && d < (int)p->devices.size() );
6571     if(p)
6572         device.set(p->devices[d]);
6573 }
6574
6575 String PlatformInfo::name() const
6576 {
6577     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6578 }
6579
6580 String PlatformInfo::vendor() const
6581 {
6582     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6583 }
6584
6585 String PlatformInfo::version() const
6586 {
6587     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6588 }
6589
6590 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6591 {
6592     cl_uint numPlatforms = 0;
6593     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6594
6595     if (numPlatforms == 0)
6596     {
6597         platforms.clear();
6598         return;
6599     }
6600
6601     platforms.resize((size_t)numPlatforms);
6602     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6603 }
6604
6605 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6606 {
6607     std::vector<cl_platform_id> platforms;
6608     getPlatforms(platforms);
6609
6610     for (size_t i = 0; i < platforms.size(); i++)
6611         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6612 }
6613
6614 const char* typeToStr(int type)
6615 {
6616     static const char* tab[]=
6617     {
6618         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6619         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6620         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6621         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6622         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6623         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6624         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6625         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6626         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6627     };
6628     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6629     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6630     CV_Assert(result);
6631     return result;
6632 }
6633
6634 const char* memopTypeToStr(int type)
6635 {
6636     static const char* tab[] =
6637     {
6638         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6639         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6640         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6641         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6642         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6643         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6644         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6645         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6646         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6647     };
6648     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6649     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6650     CV_Assert(result);
6651     return result;
6652 }
6653
6654 const char* vecopTypeToStr(int type)
6655 {
6656     static const char* tab[] =
6657     {
6658         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6659         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6660         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6661         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6662         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6663         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6664         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6665         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6666         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6667     };
6668     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6669     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6670     CV_Assert(result);
6671     return result;
6672 }
6673
6674 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6675 {
6676     if( sdepth == ddepth )
6677         return "noconvert";
6678     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6679     if( ddepth >= CV_32F ||
6680         (ddepth == CV_32S && sdepth < CV_32S) ||
6681         (ddepth == CV_16S && sdepth <= CV_8S) ||
6682         (ddepth == CV_16U && sdepth == CV_8U))
6683     {
6684         sprintf(buf, "convert_%s", typestr);
6685     }
6686     else if( sdepth >= CV_32F )
6687         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6688     else
6689         sprintf(buf, "convert_%s_sat", typestr);
6690
6691     return buf;
6692 }
6693
6694 const char* getOpenCLErrorString(int errorCode)
6695 {
6696 #define CV_OCL_CODE(id) case id: return #id
6697 #define CV_OCL_CODE_(id, name) case id: return #name
6698     switch (errorCode)
6699     {
6700     CV_OCL_CODE(CL_SUCCESS);
6701     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6702     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6703     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6704     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6705     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6706     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6707     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6708     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6709     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6710     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6711     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6712     CV_OCL_CODE(CL_MAP_FAILURE);
6713     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6714     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6715     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6716     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6717     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6718     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6719     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6720     CV_OCL_CODE(CL_INVALID_VALUE);
6721     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6722     CV_OCL_CODE(CL_INVALID_PLATFORM);
6723     CV_OCL_CODE(CL_INVALID_DEVICE);
6724     CV_OCL_CODE(CL_INVALID_CONTEXT);
6725     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6726     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6727     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6728     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6729     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6730     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6731     CV_OCL_CODE(CL_INVALID_SAMPLER);
6732     CV_OCL_CODE(CL_INVALID_BINARY);
6733     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6734     CV_OCL_CODE(CL_INVALID_PROGRAM);
6735     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6736     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6737     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6738     CV_OCL_CODE(CL_INVALID_KERNEL);
6739     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6740     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6741     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6742     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6743     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6744     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6745     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6746     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6747     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6748     CV_OCL_CODE(CL_INVALID_EVENT);
6749     CV_OCL_CODE(CL_INVALID_OPERATION);
6750     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6751     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6752     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6753     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6754     // OpenCL 1.1
6755     CV_OCL_CODE(CL_INVALID_PROPERTY);
6756     // OpenCL 1.2
6757     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6758     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6759     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6760     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6761     // OpenCL 2.0
6762     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6763     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6764     // Extensions
6765     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6766     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6767     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6768     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6769     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6770     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6771     default: return "Unknown OpenCL error";
6772     }
6773 #undef CV_OCL_CODE
6774 #undef CV_OCL_CODE_
6775 }
6776
6777 template <typename T>
6778 static std::string kerToStr(const Mat & k)
6779 {
6780     int width = k.cols - 1, depth = k.depth();
6781     const T * const data = k.ptr<T>();
6782
6783     std::ostringstream stream;
6784     stream.precision(10);
6785
6786     if (depth <= CV_8S)
6787     {
6788         for (int i = 0; i < width; ++i)
6789             stream << "DIG(" << (int)data[i] << ")";
6790         stream << "DIG(" << (int)data[width] << ")";
6791     }
6792     else if (depth == CV_32F)
6793     {
6794         stream.setf(std::ios_base::showpoint);
6795         for (int i = 0; i < width; ++i)
6796             stream << "DIG(" << data[i] << "f)";
6797         stream << "DIG(" << data[width] << "f)";
6798     }
6799     else
6800     {
6801         for (int i = 0; i < width; ++i)
6802             stream << "DIG(" << data[i] << ")";
6803         stream << "DIG(" << data[width] << ")";
6804     }
6805
6806     return stream.str();
6807 }
6808
6809 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6810 {
6811     Mat kernel = _kernel.getMat().reshape(1, 1);
6812
6813     int depth = kernel.depth();
6814     if (ddepth < 0)
6815         ddepth = depth;
6816
6817     if (ddepth != depth)
6818         kernel.convertTo(kernel, ddepth);
6819
6820     typedef std::string (* func_t)(const Mat &);
6821     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6822                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6823     const func_t func = funcs[ddepth];
6824     CV_Assert(func != 0);
6825
6826     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6827 }
6828
6829 #define PROCESS_SRC(src) \
6830     do \
6831     { \
6832         if (!src.empty()) \
6833         { \
6834             CV_Assert(src.isMat() || src.isUMat()); \
6835             Size csize = src.size(); \
6836             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6837                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6838             if (cwidth < ckercn || ckercn <= 0) \
6839                 return 1; \
6840             cols.push_back(cwidth); \
6841             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6842                 return 1; \
6843             offsets.push_back(src.offset()); \
6844             steps.push_back(src.step()); \
6845             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6846             kercns.push_back(ckercn); \
6847         } \
6848     } \
6849     while ((void)0, 0)
6850
6851 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6852                               InputArray src4, InputArray src5, InputArray src6,
6853                               InputArray src7, InputArray src8, InputArray src9,
6854                               OclVectorStrategy strat)
6855 {
6856     const ocl::Device & d = ocl::Device::getDefault();
6857
6858     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6859         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6860         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6861         d.preferredVectorWidthDouble(), -1 };
6862
6863     // if the device says don't use vectors
6864     if (vectorWidths[0] == 1)
6865     {
6866         // it's heuristic
6867         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6868         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6869         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6870     }
6871
6872     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6873 }
6874
6875 int checkOptimalVectorWidth(const int *vectorWidths,
6876                             InputArray src1, InputArray src2, InputArray src3,
6877                             InputArray src4, InputArray src5, InputArray src6,
6878                             InputArray src7, InputArray src8, InputArray src9,
6879                             OclVectorStrategy strat)
6880 {
6881     CV_Assert(vectorWidths);
6882
6883     int ref_type = src1.type();
6884
6885     std::vector<size_t> offsets, steps, cols;
6886     std::vector<int> dividers, kercns;
6887     PROCESS_SRC(src1);
6888     PROCESS_SRC(src2);
6889     PROCESS_SRC(src3);
6890     PROCESS_SRC(src4);
6891     PROCESS_SRC(src5);
6892     PROCESS_SRC(src6);
6893     PROCESS_SRC(src7);
6894     PROCESS_SRC(src8);
6895     PROCESS_SRC(src9);
6896
6897     size_t size = offsets.size();
6898
6899     for (size_t i = 0; i < size; ++i)
6900         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6901             dividers[i] >>= 1, kercns[i] >>= 1;
6902
6903     // default strategy
6904     int kercn = *std::min_element(kercns.begin(), kercns.end());
6905
6906     return kercn;
6907 }
6908
6909 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6910                                  InputArray src4, InputArray src5, InputArray src6,
6911                                  InputArray src7, InputArray src8, InputArray src9)
6912 {
6913     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6914 }
6915
6916 #undef PROCESS_SRC
6917
6918
6919 // TODO Make this as a method of OpenCL "BuildOptions" class
6920 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6921 {
6922     if (!buildOptions.empty())
6923         buildOptions += " ";
6924     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6925     buildOptions += format(
6926             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6927             name.c_str(), ocl::typeToStr(type),
6928             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6929             name.c_str(), (int)CV_MAT_CN(type),
6930             name.c_str(), (int)CV_ELEM_SIZE(type),
6931             name.c_str(), (int)CV_ELEM_SIZE1(type),
6932             name.c_str(), (int)depth
6933             );
6934 }
6935
6936
6937 struct Image2D::Impl
6938 {
6939     Impl(const UMat &src, bool norm, bool alias)
6940     {
6941         handle = 0;
6942         refcount = 1;
6943         init(src, norm, alias);
6944     }
6945
6946     ~Impl()
6947     {
6948         if (handle)
6949             clReleaseMemObject(handle);
6950     }
6951
6952     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6953     {
6954         cl_image_format format;
6955         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6956                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6957         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6958                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6959         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6960
6961         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6962         int channelOrder = channelOrders[cn];
6963         format.image_channel_data_type = (cl_channel_type)channelType;
6964         format.image_channel_order = (cl_channel_order)channelOrder;
6965         return format;
6966     }
6967
6968     static bool isFormatSupported(cl_image_format format)
6969     {
6970         if (!haveOpenCL())
6971             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6972
6973         cl_context context = (cl_context)Context::getDefault().ptr();
6974         if (!context)
6975             return false;
6976
6977         // Figure out how many formats are supported by this context.
6978         cl_uint numFormats = 0;
6979         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6980                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6981                                                 NULL, &numFormats);
6982         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6983         if (numFormats > 0)
6984         {
6985             AutoBuffer<cl_image_format> formats(numFormats);
6986             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6987                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6988                                              formats.data(), NULL);
6989             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6990             for (cl_uint i = 0; i < numFormats; ++i)
6991             {
6992                 if (!memcmp(&formats[i], &format, sizeof(format)))
6993                 {
6994                     return true;
6995                 }
6996             }
6997         }
6998         return false;
6999     }
7000
7001     void init(const UMat &src, bool norm, bool alias)
7002     {
7003         if (!haveOpenCL())
7004             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7005
7006         CV_Assert(!src.empty());
7007         CV_Assert(ocl::Device::getDefault().imageSupport());
7008
7009         int err, depth = src.depth(), cn = src.channels();
7010         CV_Assert(cn <= 4);
7011         cl_image_format format = getImageFormat(depth, cn, norm);
7012
7013         if (!isFormatSupported(format))
7014             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7015
7016         if (alias && !src.handle(ACCESS_RW))
7017             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7018
7019         cl_context context = (cl_context)Context::getDefault().ptr();
7020         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7021
7022 #ifdef CL_VERSION_1_2
7023         // this enables backwards portability to
7024         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7025         const Device & d = ocl::Device::getDefault();
7026         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7027         CV_Assert(!alias || canCreateAlias(src));
7028         if (1 < major || (1 == major && 2 <= minor))
7029         {
7030             cl_image_desc desc;
7031             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7032             desc.image_width      = src.cols;
7033             desc.image_height     = src.rows;
7034             desc.image_depth      = 0;
7035             desc.image_array_size = 1;
7036             desc.image_row_pitch  = alias ? src.step[0] : 0;
7037             desc.image_slice_pitch = 0;
7038             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7039             desc.num_mip_levels   = 0;
7040             desc.num_samples      = 0;
7041             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7042         }
7043         else
7044 #endif
7045         {
7046             CV_SUPPRESS_DEPRECATED_START
7047             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7048             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7049             CV_SUPPRESS_DEPRECATED_END
7050         }
7051         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7052
7053         size_t origin[] = { 0, 0, 0 };
7054         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7055
7056         cl_mem devData;
7057         if (!alias && !src.isContinuous())
7058         {
7059             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7060             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7061                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7062                 ).c_str());
7063
7064             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7065             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7066                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7067             CV_OCL_DBG_CHECK(clFlush(queue));
7068         }
7069         else
7070         {
7071             devData = (cl_mem)src.handle(ACCESS_READ);
7072         }
7073         CV_Assert(devData != NULL);
7074
7075         if (!alias)
7076         {
7077             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7078             if (!src.isContinuous())
7079             {
7080                 CV_OCL_DBG_CHECK(clFlush(queue));
7081                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7082             }
7083         }
7084     }
7085
7086     IMPLEMENT_REFCOUNTABLE();
7087
7088     cl_mem handle;
7089 };
7090
7091 Image2D::Image2D()
7092 {
7093     p = NULL;
7094 }
7095
7096 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7097 {
7098     p = new Impl(src, norm, alias);
7099 }
7100
7101 bool Image2D::canCreateAlias(const UMat &m)
7102 {
7103     bool ret = false;
7104     const Device & d = ocl::Device::getDefault();
7105     if (d.imageFromBufferSupport() && !m.empty())
7106     {
7107         // This is the required pitch alignment in pixels
7108         uint pitchAlign = d.imagePitchAlignment();
7109         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7110         {
7111             // We don't currently handle the case where the buffer was created
7112             // with CL_MEM_USE_HOST_PTR
7113             if (!m.u->tempUMat())
7114             {
7115                 ret = true;
7116             }
7117         }
7118     }
7119     return ret;
7120 }
7121
7122 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7123 {
7124     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7125
7126     return Impl::isFormatSupported(format);
7127 }
7128
7129 Image2D::Image2D(const Image2D & i)
7130 {
7131     p = i.p;
7132     if (p)
7133         p->addref();
7134 }
7135
7136 Image2D & Image2D::operator = (const Image2D & i)
7137 {
7138     if (i.p != p)
7139     {
7140         if (i.p)
7141             i.p->addref();
7142         if (p)
7143             p->release();
7144         p = i.p;
7145     }
7146     return *this;
7147 }
7148
7149 Image2D::~Image2D()
7150 {
7151     if (p)
7152         p->release();
7153 }
7154
7155 void* Image2D::ptr() const
7156 {
7157     return p ? p->handle : 0;
7158 }
7159
7160 bool internal::isOpenCLForced()
7161 {
7162     static bool initialized = false;
7163     static bool value = false;
7164     if (!initialized)
7165     {
7166         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7167         initialized = true;
7168     }
7169     return value;
7170 }
7171
7172 bool internal::isPerformanceCheckBypassed()
7173 {
7174     static bool initialized = false;
7175     static bool value = false;
7176     if (!initialized)
7177     {
7178         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7179         initialized = true;
7180     }
7181     return value;
7182 }
7183
7184 bool internal::isCLBuffer(UMat& u)
7185 {
7186     void* h = u.handle(ACCESS_RW);
7187     if (!h)
7188         return true;
7189     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7190 #if 1
7191     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7192         return false;
7193 #else
7194     cl_mem_object_type type = 0;
7195     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7196     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7197         return false;
7198 #endif
7199     return true;
7200 }
7201
7202 struct Timer::Impl
7203 {
7204     const Queue queue;
7205
7206     Impl(const Queue& q)
7207         : queue(q)
7208     {
7209     }
7210
7211     ~Impl(){}
7212
7213     void start()
7214     {
7215         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7216         timer.start();
7217     }
7218
7219     void stop()
7220     {
7221         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7222         timer.stop();
7223     }
7224
7225     uint64 durationNS() const
7226     {
7227         return (uint64)(timer.getTimeSec() * 1e9);
7228     }
7229
7230     TickMeter timer;
7231 };
7232
7233 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7234 Timer::~Timer() { delete p; }
7235
7236 void Timer::start()
7237 {
7238     CV_Assert(p);
7239     p->start();
7240 }
7241
7242 void Timer::stop()
7243 {
7244     CV_Assert(p);
7245     p->stop();
7246 }
7247
7248 uint64 Timer::durationNS() const
7249 {
7250     CV_Assert(p);
7251     return p->durationNS();
7252 }
7253
7254 }} // namespace
7255
7256 #endif // HAVE_OPENCL