Merge pull request #18186 from alalek:opencl_multiple_execution_contexts
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832
833 struct OpenCLExecutionContext::Impl
834 {
835     ocl::Context context_;
836     int device_;  // device index in context
837     ocl::Queue queue_;
838     int useOpenCL_;
839
840 protected:
841     Impl() = delete;
842
843     void _init_device(cl_device_id deviceID)
844     {
845         CV_Assert(deviceID);
846         int ndevices = (int)context_.ndevices();
847         CV_Assert(ndevices > 0);
848         bool found = false;
849         for (int i = 0; i < ndevices; i++)
850         {
851             ocl::Device d = context_.device(i);
852             cl_device_id dhandle = (cl_device_id)d.ptr();
853             if (dhandle == deviceID)
854             {
855                 device_ = i;
856                 found = true;
857                 break;
858             }
859         }
860         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
861     }
862
863     void _init_device(const ocl::Device& device)
864     {
865         CV_Assert(device.ptr());
866         int ndevices = (int)context_.ndevices();
867         CV_Assert(ndevices > 0);
868         bool found = false;
869         for (int i = 0; i < ndevices; i++)
870         {
871             ocl::Device d = context_.device(i);
872             if (d.getImpl() == device.getImpl())
873             {
874                 device_ = i;
875                 found = true;
876                 break;
877             }
878         }
879         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
880     }
881
882 public:
883     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
884         : device_(0), useOpenCL_(-1)
885     {
886         CV_UNUSED(platformID);
887         CV_Assert(context);
888         CV_Assert(deviceID);
889
890         context_ = Context::fromHandle(context);
891         _init_device(deviceID);
892         queue_ = Queue(context_, context_.device(device_));
893     }
894
895     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
896         : device_(0), useOpenCL_(-1)
897     {
898         CV_Assert(context.ptr());
899         CV_Assert(device.ptr());
900
901         context_ = context;
902         _init_device(device);
903         queue_ = queue;
904     }
905
906     Impl(const ocl::Context& context, const ocl::Device& device)
907         : device_(0), useOpenCL_(-1)
908     {
909         CV_Assert(context.ptr());
910         CV_Assert(device.ptr());
911
912         context_ = context;
913         _init_device(device);
914         queue_ = Queue(context_, context_.device(device_));
915     }
916
917     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
918         : context_(context)
919         , device_(device)
920         , queue_(queue)
921         , useOpenCL_(-1)
922     {
923         // nothing
924     }
925     Impl(const Impl& other)
926         : context_(other.context_)
927         , device_(other.device_)
928         , queue_(other.queue_)
929         , useOpenCL_(-1)
930     {
931         // nothing
932     }
933
934     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
935     bool useOpenCL()
936     {
937         if (useOpenCL_ < 0)
938         {
939             try
940             {
941                 useOpenCL_ = 0;
942                 if (!context_.empty() && context_.ndevices() > 0)
943                 {
944                     const Device& d = context_.device(device_);
945                     useOpenCL_ = d.available();
946                 }
947             }
948             catch (const cv::Exception&)
949             {
950                 // nothing
951             }
952             if (!useOpenCL_)
953                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
954         }
955         return useOpenCL_ > 0;
956     }
957
958     void setUseOpenCL(bool flag)
959     {
960         if (!flag)
961             useOpenCL_ = 0;
962         else
963             useOpenCL_ = -1;
964     }
965
966     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
967     {
968         CV_TRACE_FUNCTION();
969
970         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
971
972         static bool initialized = false;
973         static std::shared_ptr<Impl> g_primaryExecutionContext;
974
975         if (!initialized)
976         {
977             cv::AutoLock lock(getInitializationMutex());
978             if (!initialized)
979             {
980                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
981                 try
982                 {
983                     Context c = ocl::Context::create(std::string());
984                     if (c.ndevices())
985                     {
986                         int deviceId = 0;
987                         auto& d = c.device(deviceId);
988                         if (d.available())
989                         {
990                             auto q = ocl::Queue(c, d);
991                             if (!q.ptr())
992                             {
993                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
994                             }
995                             else
996                             {
997                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
998                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
999                             }
1000                         }
1001                         else
1002                         {
1003                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1004                         }
1005                     }
1006                     else
1007                     {
1008                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1009                     }
1010                 }
1011                 catch (const std::exception& e)
1012                 {
1013                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1014                 }
1015                 catch (...)
1016                 {
1017                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1018                 }
1019                 initialized = true;
1020             }
1021         }
1022         return g_primaryExecutionContext;
1023     }
1024 };
1025
1026 Context& OpenCLExecutionContext::getContext() const
1027 {
1028     CV_Assert(p);
1029     return p->context_;
1030 }
1031 Device& OpenCLExecutionContext::getDevice() const
1032 {
1033     CV_Assert(p);
1034     return p->context_.device(p->device_);
1035 }
1036 Queue& OpenCLExecutionContext::getQueue() const
1037 {
1038     CV_Assert(p);
1039     return p->queue_;
1040 }
1041
1042 bool OpenCLExecutionContext::useOpenCL() const
1043 {
1044     if (p)
1045         return p->useOpenCL();
1046     return false;
1047 }
1048 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1049 {
1050     CV_Assert(p);
1051     p->setUseOpenCL(flag);
1052 }
1053
1054 /* static */
1055 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1056 {
1057     CV_TRACE_FUNCTION();
1058     CoreTLSData& data = getCoreTlsData();
1059     OpenCLExecutionContext& c = data.oclExecutionContext;
1060     if (!data.oclExecutionContextInitialized)
1061     {
1062         data.oclExecutionContextInitialized = true;
1063         if (c.empty() && haveOpenCL())
1064             c.p = Impl::getInitializedExecutionContext();
1065     }
1066     return c;
1067 }
1068
1069 /* static */
1070 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1071 {
1072     CV_TRACE_FUNCTION();
1073     CoreTLSData& data = getCoreTlsData();
1074     OpenCLExecutionContext& c = data.oclExecutionContext;
1075     return c;
1076 }
1077
1078 void OpenCLExecutionContext::bind() const
1079 {
1080     CV_TRACE_FUNCTION();
1081     CV_Assert(p);
1082     CoreTLSData& data = getCoreTlsData();
1083     data.oclExecutionContext = *this;
1084     data.oclExecutionContextInitialized = true;
1085     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1086 }
1087
1088
1089 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1090 {
1091     CV_TRACE_FUNCTION();
1092     CV_Assert(p);
1093     const Queue q(getContext(), getDevice());
1094     return cloneWithNewQueue(q);
1095 }
1096
1097 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1098 {
1099     CV_TRACE_FUNCTION();
1100     CV_Assert(p);
1101     CV_Assert(q.ptr() != NULL);
1102     OpenCLExecutionContext c;
1103     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1104     return c;
1105 }
1106
1107 /* static */
1108 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1109 {
1110     CV_TRACE_FUNCTION();
1111     if (!haveOpenCL())
1112         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1113
1114     CV_Assert(!context.empty());
1115     CV_Assert(context.ptr());
1116     CV_Assert(!device.empty());
1117     CV_Assert(device.ptr());
1118     OpenCLExecutionContext ctx;
1119     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1120     return ctx;
1121
1122 }
1123
1124 /* static */
1125 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1126 {
1127     CV_TRACE_FUNCTION();
1128     if (!haveOpenCL())
1129         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1130
1131     CV_Assert(!context.empty());
1132     CV_Assert(context.ptr());
1133     CV_Assert(!device.empty());
1134     CV_Assert(device.ptr());
1135     OpenCLExecutionContext ctx;
1136     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1137     return ctx;
1138
1139 }
1140
1141 void OpenCLExecutionContext::release()
1142 {
1143     CV_TRACE_FUNCTION();
1144     p.reset();
1145 }
1146
1147
1148 // true if we have initialized OpenCL subsystem with available platforms
1149 static bool g_isOpenCVActivated = false;
1150
1151 bool haveOpenCL()
1152 {
1153     CV_TRACE_FUNCTION();
1154     static bool g_isOpenCLInitialized = false;
1155     static bool g_isOpenCLAvailable = false;
1156
1157     if (!g_isOpenCLInitialized)
1158     {
1159         CV_TRACE_REGION("Init_OpenCL_Runtime");
1160         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1161         if (envPath)
1162         {
1163             if (cv::String(envPath) == "disabled")
1164             {
1165                 g_isOpenCLAvailable = false;
1166                 g_isOpenCLInitialized = true;
1167                 return false;
1168             }
1169         }
1170
1171         cv::AutoLock lock(getInitializationMutex());
1172         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1173         try
1174         {
1175             cl_uint n = 0;
1176             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1177             g_isOpenCVActivated = n > 0;
1178             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1179         }
1180         catch (...)
1181         {
1182             g_isOpenCLAvailable = false;
1183         }
1184         g_isOpenCLInitialized = true;
1185     }
1186     return g_isOpenCLAvailable;
1187 }
1188
1189 bool useOpenCL()
1190 {
1191     CoreTLSData& data = getCoreTlsData();
1192     if (data.useOpenCL < 0)
1193     {
1194         try
1195         {
1196             data.useOpenCL = 0;
1197             if (haveOpenCL())
1198             {
1199                 auto c = OpenCLExecutionContext::getCurrent();
1200                 data.useOpenCL = c.useOpenCL();
1201             }
1202         }
1203         catch (...)
1204         {
1205             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1206         }
1207     }
1208     return data.useOpenCL > 0;
1209 }
1210
1211 bool isOpenCLActivated()
1212 {
1213     if (!g_isOpenCVActivated)
1214         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1215     return useOpenCL();
1216 }
1217
1218 void setUseOpenCL(bool flag)
1219 {
1220     CV_TRACE_FUNCTION();
1221
1222     CoreTLSData& data = getCoreTlsData();
1223     auto& c = OpenCLExecutionContext::getCurrentRef();
1224     if (!c.empty())
1225     {
1226         c.setUseOpenCL(flag);
1227         data.useOpenCL = c.useOpenCL();
1228     }
1229     else
1230     {
1231         if (!flag)
1232             data.useOpenCL = 0;
1233         else
1234             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1235     }
1236 }
1237
1238
1239
1240 #ifdef HAVE_CLAMDBLAS
1241
1242 class AmdBlasHelper
1243 {
1244 public:
1245     static AmdBlasHelper & getInstance()
1246     {
1247         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1248     }
1249
1250     bool isAvailable() const
1251     {
1252         return g_isAmdBlasAvailable;
1253     }
1254
1255     ~AmdBlasHelper()
1256     {
1257         try
1258         {
1259             clAmdBlasTeardown();
1260         }
1261         catch (...) { }
1262     }
1263
1264 protected:
1265     AmdBlasHelper()
1266     {
1267         if (!g_isAmdBlasInitialized)
1268         {
1269             AutoLock lock(getInitializationMutex());
1270
1271             if (!g_isAmdBlasInitialized)
1272             {
1273                 if (haveOpenCL())
1274                 {
1275                     try
1276                     {
1277                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1278                     }
1279                     catch (...)
1280                     {
1281                         g_isAmdBlasAvailable = false;
1282                     }
1283                 }
1284                 else
1285                     g_isAmdBlasAvailable = false;
1286
1287                 g_isAmdBlasInitialized = true;
1288             }
1289         }
1290     }
1291
1292 private:
1293     static bool g_isAmdBlasInitialized;
1294     static bool g_isAmdBlasAvailable;
1295 };
1296
1297 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1298 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1299
1300 bool haveAmdBlas()
1301 {
1302     return AmdBlasHelper::getInstance().isAvailable();
1303 }
1304
1305 #else
1306
1307 bool haveAmdBlas()
1308 {
1309     return false;
1310 }
1311
1312 #endif
1313
1314 #ifdef HAVE_CLAMDFFT
1315
1316 class AmdFftHelper
1317 {
1318 public:
1319     static AmdFftHelper & getInstance()
1320     {
1321         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1322     }
1323
1324     bool isAvailable() const
1325     {
1326         return g_isAmdFftAvailable;
1327     }
1328
1329     ~AmdFftHelper()
1330     {
1331         try
1332         {
1333 //            clAmdFftTeardown();
1334         }
1335         catch (...) { }
1336     }
1337
1338 protected:
1339     AmdFftHelper()
1340     {
1341         if (!g_isAmdFftInitialized)
1342         {
1343             AutoLock lock(getInitializationMutex());
1344
1345             if (!g_isAmdFftInitialized)
1346             {
1347                 if (haveOpenCL())
1348                 {
1349                     try
1350                     {
1351                         cl_uint major, minor, patch;
1352                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1353
1354                         // it throws exception in case AmdFft binaries are not found
1355                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1356                         g_isAmdFftAvailable = true;
1357                     }
1358                     catch (const Exception &)
1359                     {
1360                         g_isAmdFftAvailable = false;
1361                     }
1362                 }
1363                 else
1364                     g_isAmdFftAvailable = false;
1365
1366                 g_isAmdFftInitialized = true;
1367             }
1368         }
1369     }
1370
1371 private:
1372     static clAmdFftSetupData setupData;
1373     static bool g_isAmdFftInitialized;
1374     static bool g_isAmdFftAvailable;
1375 };
1376
1377 clAmdFftSetupData AmdFftHelper::setupData;
1378 bool AmdFftHelper::g_isAmdFftAvailable = false;
1379 bool AmdFftHelper::g_isAmdFftInitialized = false;
1380
1381 bool haveAmdFft()
1382 {
1383     return AmdFftHelper::getInstance().isAvailable();
1384 }
1385
1386 #else
1387
1388 bool haveAmdFft()
1389 {
1390     return false;
1391 }
1392
1393 #endif
1394
1395 bool haveSVM()
1396 {
1397 #ifdef HAVE_OPENCL_SVM
1398     return true;
1399 #else
1400     return false;
1401 #endif
1402 }
1403
1404 void finish()
1405 {
1406     Queue::getDefault().finish();
1407 }
1408
1409 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1410
1411 struct Platform::Impl
1412 {
1413     Impl()
1414     {
1415         refcount = 1;
1416         handle = 0;
1417         initialized = false;
1418     }
1419
1420     ~Impl() {}
1421
1422     void init()
1423     {
1424         if( !initialized )
1425         {
1426             //cl_uint num_entries
1427             cl_uint n = 0;
1428             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1429                 handle = 0;
1430             if( handle != 0 )
1431             {
1432                 char buf[1000];
1433                 size_t len = 0;
1434                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1435                 buf[len] = '\0';
1436                 vendor = String(buf);
1437             }
1438
1439             initialized = true;
1440         }
1441     }
1442
1443     IMPLEMENT_REFCOUNTABLE();
1444
1445     cl_platform_id handle;
1446     String vendor;
1447     bool initialized;
1448 };
1449
1450 Platform::Platform()
1451 {
1452     p = 0;
1453 }
1454
1455 Platform::~Platform()
1456 {
1457     if(p)
1458         p->release();
1459 }
1460
1461 Platform::Platform(const Platform& pl)
1462 {
1463     p = (Impl*)pl.p;
1464     if(p)
1465         p->addref();
1466 }
1467
1468 Platform& Platform::operator = (const Platform& pl)
1469 {
1470     Impl* newp = (Impl*)pl.p;
1471     if(newp)
1472         newp->addref();
1473     if(p)
1474         p->release();
1475     p = newp;
1476     return *this;
1477 }
1478
1479 void* Platform::ptr() const
1480 {
1481     return p ? p->handle : 0;
1482 }
1483
1484 Platform& Platform::getDefault()
1485 {
1486     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1487     static Platform p;
1488     if( !p.p )
1489     {
1490         p.p = new Impl;
1491         p.p->init();
1492     }
1493     return p;
1494 }
1495
1496 /////////////////////////////////////// Device ////////////////////////////////////////////
1497
1498 // deviceVersion has format
1499 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1500 // by specification
1501 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1502 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1503 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1504 {
1505     major = minor = 0;
1506     if (10 >= deviceVersion.length())
1507         return;
1508     const char *pstr = deviceVersion.c_str();
1509     if (0 != strncmp(pstr, "OpenCL ", 7))
1510         return;
1511     size_t ppos = deviceVersion.find('.', 7);
1512     if (String::npos == ppos)
1513         return;
1514     String temp = deviceVersion.substr(7, ppos - 7);
1515     major = atoi(temp.c_str());
1516     temp = deviceVersion.substr(ppos + 1);
1517     minor = atoi(temp.c_str());
1518 }
1519
1520 struct Device::Impl
1521 {
1522     Impl(void* d)
1523         : refcount(1)
1524         , handle(0)
1525     {
1526         try
1527         {
1528             cl_device_id device = (cl_device_id)d;
1529             _init(device);
1530             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1531         }
1532         catch (...)
1533         {
1534             throw;
1535         }
1536     }
1537
1538     void _init(cl_device_id d)
1539     {
1540         handle = (cl_device_id)d;
1541
1542         name_ = getStrProp(CL_DEVICE_NAME);
1543         version_ = getStrProp(CL_DEVICE_VERSION);
1544         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1545         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1546         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1547         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1548         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1549         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1550         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1551         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1552
1553         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1554         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1555
1556         size_t pos = 0;
1557         while (pos < extensions_.size())
1558         {
1559             size_t pos2 = extensions_.find(' ', pos);
1560             if (pos2 == String::npos)
1561                 pos2 = extensions_.size();
1562             if (pos2 > pos)
1563             {
1564                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1565                 extensions_set_.insert(extensionName);
1566             }
1567             pos = pos2 + 1;
1568         }
1569
1570         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1571
1572         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1573         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1574             vendorName_ == "AMD")
1575             vendorID_ = VENDOR_AMD;
1576         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1577             vendorID_ = VENDOR_INTEL;
1578         else if (vendorName_ == "NVIDIA Corporation")
1579             vendorID_ = VENDOR_NVIDIA;
1580         else
1581             vendorID_ = UNKNOWN_VENDOR;
1582
1583         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1584         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1585         {
1586             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1587             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1588                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1589             maxWorkGroupSize_ = new_maxWorkGroupSize;
1590         }
1591 #if 0
1592         if (isExtensionSupported("cl_khr_spir"))
1593         {
1594 #ifndef CL_DEVICE_SPIR_VERSIONS
1595 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1596 #endif
1597             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1598             std::cout << spir_versions << std::endl;
1599         }
1600 #endif
1601     }
1602
1603     ~Impl()
1604     {
1605 #ifdef _WIN32
1606         if (!cv::__termination)
1607 #endif
1608         {
1609             if (handle)
1610             {
1611                 CV_OCL_CHECK(clReleaseDevice(handle));
1612                 handle = 0;
1613             }
1614         }
1615     }
1616
1617     template<typename _TpCL, typename _TpOut>
1618     _TpOut getProp(cl_device_info prop) const
1619     {
1620         _TpCL temp=_TpCL();
1621         size_t sz = 0;
1622
1623         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1624             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1625     }
1626
1627     bool getBoolProp(cl_device_info prop) const
1628     {
1629         cl_bool temp = CL_FALSE;
1630         size_t sz = 0;
1631
1632         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1633             sz == sizeof(temp) ? temp != 0 : false;
1634     }
1635
1636     String getStrProp(cl_device_info prop) const
1637     {
1638         char buf[4096];
1639         size_t sz=0;
1640         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1641             sz < sizeof(buf) ? String(buf) : String();
1642     }
1643
1644     bool isExtensionSupported(const std::string& extensionName) const
1645     {
1646         return extensions_set_.count(extensionName) > 0;
1647     }
1648
1649
1650     IMPLEMENT_REFCOUNTABLE();
1651
1652     cl_device_id handle;
1653
1654     String name_;
1655     String version_;
1656     std::string extensions_;
1657     int doubleFPConfig_;
1658     bool hostUnifiedMemory_;
1659     int maxComputeUnits_;
1660     size_t maxWorkGroupSize_;
1661     int type_;
1662     int addressBits_;
1663     int deviceVersionMajor_;
1664     int deviceVersionMinor_;
1665     String driverVersion_;
1666     String vendorName_;
1667     int vendorID_;
1668     bool intelSubgroupsSupport_;
1669
1670     std::set<std::string> extensions_set_;
1671 };
1672
1673
1674 Device::Device()
1675 {
1676     p = 0;
1677 }
1678
1679 Device::Device(void* d)
1680 {
1681     p = 0;
1682     set(d);
1683 }
1684
1685 Device::Device(const Device& d)
1686 {
1687     p = d.p;
1688     if(p)
1689         p->addref();
1690 }
1691
1692 Device& Device::operator = (const Device& d)
1693 {
1694     Impl* newp = (Impl*)d.p;
1695     if(newp)
1696         newp->addref();
1697     if(p)
1698         p->release();
1699     p = newp;
1700     return *this;
1701 }
1702
1703 Device::~Device()
1704 {
1705     if(p)
1706         p->release();
1707 }
1708
1709 void Device::set(void* d)
1710 {
1711     if(p)
1712         p->release();
1713     p = new Impl(d);
1714     if (p->handle)
1715     {
1716         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1717     }
1718 }
1719
1720 Device Device::fromHandle(void* d)
1721 {
1722     Device device(d);
1723     return device;
1724 }
1725
1726 void* Device::ptr() const
1727 {
1728     return p ? p->handle : 0;
1729 }
1730
1731 String Device::name() const
1732 { return p ? p->name_ : String(); }
1733
1734 String Device::extensions() const
1735 { return p ? String(p->extensions_) : String(); }
1736
1737 bool Device::isExtensionSupported(const String& extensionName) const
1738 { return p ? p->isExtensionSupported(extensionName) : false; }
1739
1740 String Device::version() const
1741 { return p ? p->version_ : String(); }
1742
1743 String Device::vendorName() const
1744 { return p ? p->vendorName_ : String(); }
1745
1746 int Device::vendorID() const
1747 { return p ? p->vendorID_ : 0; }
1748
1749 String Device::OpenCL_C_Version() const
1750 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1751
1752 String Device::OpenCLVersion() const
1753 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1754
1755 int Device::deviceVersionMajor() const
1756 { return p ? p->deviceVersionMajor_ : 0; }
1757
1758 int Device::deviceVersionMinor() const
1759 { return p ? p->deviceVersionMinor_ : 0; }
1760
1761 String Device::driverVersion() const
1762 { return p ? p->driverVersion_ : String(); }
1763
1764 int Device::type() const
1765 { return p ? p->type_ : 0; }
1766
1767 int Device::addressBits() const
1768 { return p ? p->addressBits_ : 0; }
1769
1770 bool Device::available() const
1771 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1772
1773 bool Device::compilerAvailable() const
1774 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1775
1776 bool Device::linkerAvailable() const
1777 #ifdef CL_VERSION_1_2
1778 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1779 #else
1780 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1781 #endif
1782
1783 int Device::doubleFPConfig() const
1784 { return p ? p->doubleFPConfig_ : 0; }
1785
1786 int Device::singleFPConfig() const
1787 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1788
1789 int Device::halfFPConfig() const
1790 #ifdef CL_VERSION_1_2
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1792 #else
1793 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1794 #endif
1795
1796 bool Device::endianLittle() const
1797 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1798
1799 bool Device::errorCorrectionSupport() const
1800 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1801
1802 int Device::executionCapabilities() const
1803 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1804
1805 size_t Device::globalMemCacheSize() const
1806 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1807
1808 int Device::globalMemCacheType() const
1809 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1810
1811 int Device::globalMemCacheLineSize() const
1812 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1813
1814 size_t Device::globalMemSize() const
1815 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1816
1817 size_t Device::localMemSize() const
1818 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1819
1820 int Device::localMemType() const
1821 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1822
1823 bool Device::hostUnifiedMemory() const
1824 { return p ? p->hostUnifiedMemory_ : false; }
1825
1826 bool Device::imageSupport() const
1827 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1828
1829 bool Device::imageFromBufferSupport() const
1830 {
1831     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1832 }
1833
1834 uint Device::imagePitchAlignment() const
1835 {
1836 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1837     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1838 #else
1839     return 0;
1840 #endif
1841 }
1842
1843 uint Device::imageBaseAddressAlignment() const
1844 {
1845 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1846     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1847 #else
1848     return 0;
1849 #endif
1850 }
1851
1852 size_t Device::image2DMaxWidth() const
1853 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1854
1855 size_t Device::image2DMaxHeight() const
1856 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1857
1858 size_t Device::image3DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1860
1861 size_t Device::image3DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1863
1864 size_t Device::image3DMaxDepth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1866
1867 size_t Device::imageMaxBufferSize() const
1868 #ifdef CL_VERSION_1_2
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1870 #else
1871 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1872 #endif
1873
1874 size_t Device::imageMaxArraySize() const
1875 #ifdef CL_VERSION_1_2
1876 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1877 #else
1878 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1879 #endif
1880
1881 bool Device::intelSubgroupsSupport() const
1882 { return p ? p->intelSubgroupsSupport_ : false; }
1883
1884 int Device::maxClockFrequency() const
1885 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1886
1887 int Device::maxComputeUnits() const
1888 { return p ? p->maxComputeUnits_ : 0; }
1889
1890 int Device::maxConstantArgs() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1892
1893 size_t Device::maxConstantBufferSize() const
1894 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1895
1896 size_t Device::maxMemAllocSize() const
1897 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1898
1899 size_t Device::maxParameterSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1901
1902 int Device::maxReadImageArgs() const
1903 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1904
1905 int Device::maxWriteImageArgs() const
1906 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1907
1908 int Device::maxSamplers() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1910
1911 size_t Device::maxWorkGroupSize() const
1912 { return p ? p->maxWorkGroupSize_ : 0; }
1913
1914 int Device::maxWorkItemDims() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1916
1917 void Device::maxWorkItemSizes(size_t* sizes) const
1918 {
1919     if(p)
1920     {
1921         const int MAX_DIMS = 32;
1922         size_t retsz = 0;
1923         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1924                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1925     }
1926 }
1927
1928 int Device::memBaseAddrAlign() const
1929 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1930
1931 int Device::nativeVectorWidthChar() const
1932 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1933
1934 int Device::nativeVectorWidthShort() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1936
1937 int Device::nativeVectorWidthInt() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1939
1940 int Device::nativeVectorWidthLong() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1942
1943 int Device::nativeVectorWidthFloat() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1945
1946 int Device::nativeVectorWidthDouble() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1948
1949 int Device::nativeVectorWidthHalf() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1951
1952 int Device::preferredVectorWidthChar() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1954
1955 int Device::preferredVectorWidthShort() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1957
1958 int Device::preferredVectorWidthInt() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1960
1961 int Device::preferredVectorWidthLong() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1963
1964 int Device::preferredVectorWidthFloat() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1966
1967 int Device::preferredVectorWidthDouble() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1969
1970 int Device::preferredVectorWidthHalf() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1972
1973 size_t Device::printfBufferSize() const
1974 #ifdef CL_VERSION_1_2
1975 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1976 #else
1977 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1978 #endif
1979
1980
1981 size_t Device::profilingTimerResolution() const
1982 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1983
1984 const Device& Device::getDefault()
1985 {
1986     auto& c = OpenCLExecutionContext::getCurrent();
1987     if (!c.empty())
1988     {
1989         return c.getDevice();
1990     }
1991
1992     static Device dummy;
1993     return dummy;
1994 }
1995
1996 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1997
1998 template <typename Functor, typename ObjectType>
1999 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2000 {
2001     ::size_t required;
2002     cl_int err = f(obj, name, 0, NULL, &required);
2003     if (err != CL_SUCCESS)
2004         return err;
2005
2006     param.clear();
2007     if (required > 0)
2008     {
2009         AutoBuffer<char> buf(required + 1);
2010         char* ptr = buf.data(); // cleanup is not needed
2011         err = f(obj, name, required, ptr, NULL);
2012         if (err != CL_SUCCESS)
2013             return err;
2014         param = ptr;
2015     }
2016
2017     return CL_SUCCESS;
2018 }
2019
2020 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2021 {
2022     elems.clear();
2023     if (s.size() == 0)
2024         return;
2025     std::istringstream ss(s);
2026     std::string item;
2027     while (!ss.eof())
2028     {
2029         std::getline(ss, item, delim);
2030         elems.push_back(item);
2031     }
2032 }
2033
2034 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2035 // Sample: AMD:GPU:
2036 // Sample: AMD:GPU:Tahiti
2037 // Sample: :GPU|CPU: = '' = ':' = '::'
2038 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2039         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2040 {
2041     std::vector<std::string> parts;
2042     split(configurationStr, ':', parts);
2043     if (parts.size() > 3)
2044     {
2045         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2046         return false;
2047     }
2048     if (parts.size() > 2)
2049         deviceNameOrID = parts[2];
2050     if (parts.size() > 1)
2051     {
2052         split(parts[1], '|', deviceTypes);
2053     }
2054     if (parts.size() > 0)
2055     {
2056         platform = parts[0];
2057     }
2058     return true;
2059 }
2060
2061 #if defined WINRT || defined _WIN32_WCE
2062 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2063 {
2064     CV_UNUSED(configuration)
2065     return NULL;
2066 }
2067 #else
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2069 {
2070     std::string platform, deviceName;
2071     std::vector<std::string> deviceTypes;
2072
2073     if (!configuration)
2074         configuration = getenv("OPENCV_OPENCL_DEVICE");
2075
2076     if (configuration &&
2077             (strcmp(configuration, "disabled") == 0 ||
2078              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2079             ))
2080         return NULL;
2081
2082     bool isID = false;
2083     int deviceID = -1;
2084     if (deviceName.length() == 1)
2085     // We limit ID range to 0..9, because we want to write:
2086     // - '2500' to mean i5-2500
2087     // - '8350' to mean AMD FX-8350
2088     // - '650' to mean GeForce 650
2089     // To extend ID range change condition to '> 0'
2090     {
2091         isID = true;
2092         for (size_t i = 0; i < deviceName.length(); i++)
2093         {
2094             if (!isdigit(deviceName[i]))
2095             {
2096                 isID = false;
2097                 break;
2098             }
2099         }
2100         if (isID)
2101         {
2102             deviceID = atoi(deviceName.c_str());
2103             if (deviceID < 0)
2104                 return NULL;
2105         }
2106     }
2107
2108     std::vector<cl_platform_id> platforms;
2109     {
2110         cl_uint numPlatforms = 0;
2111         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2112
2113         if (numPlatforms == 0)
2114             return NULL;
2115         platforms.resize((size_t)numPlatforms);
2116         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2117         platforms.resize(numPlatforms);
2118     }
2119
2120     int selectedPlatform = -1;
2121     if (platform.length() > 0)
2122     {
2123         for (size_t i = 0; i < platforms.size(); i++)
2124         {
2125             std::string name;
2126             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2127             if (name.find(platform) != std::string::npos)
2128             {
2129                 selectedPlatform = (int)i;
2130                 break;
2131             }
2132         }
2133         if (selectedPlatform == -1)
2134         {
2135             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2136             goto not_found;
2137         }
2138     }
2139     if (deviceTypes.size() == 0)
2140     {
2141         if (!isID)
2142         {
2143             deviceTypes.push_back("GPU");
2144             if (configuration)
2145                 deviceTypes.push_back("CPU");
2146         }
2147         else
2148             deviceTypes.push_back("ALL");
2149     }
2150     for (size_t t = 0; t < deviceTypes.size(); t++)
2151     {
2152         int deviceType = 0;
2153         std::string tempStrDeviceType = deviceTypes[t];
2154         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2155
2156         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2157             deviceType = Device::TYPE_GPU;
2158         else if (tempStrDeviceType == "cpu")
2159             deviceType = Device::TYPE_CPU;
2160         else if (tempStrDeviceType == "accelerator")
2161             deviceType = Device::TYPE_ACCELERATOR;
2162         else if (tempStrDeviceType == "all")
2163             deviceType = Device::TYPE_ALL;
2164         else
2165         {
2166             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2167             goto not_found;
2168         }
2169
2170         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2171         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2172                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2173                 i++)
2174         {
2175             cl_uint count = 0;
2176             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2177             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2178             {
2179                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2180             }
2181             if (count == 0)
2182                 continue;
2183             size_t base = devices.size();
2184             devices.resize(base + count);
2185             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2186             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2187             {
2188                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2189             }
2190         }
2191
2192         for (size_t i = (isID ? deviceID : 0);
2193              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2194              i++)
2195         {
2196             std::string name;
2197             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2198             cl_bool useGPU = true;
2199             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2200             {
2201                 cl_bool isIGPU = CL_FALSE;
2202                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2203                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2204             }
2205             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2206             {
2207                 // TODO check for OpenCL 1.1
2208                 return devices[i];
2209             }
2210         }
2211     }
2212
2213 not_found:
2214     if (!configuration)
2215         return NULL; // suppress messages on stderr
2216
2217     std::ostringstream msg;
2218     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2219         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2220         << "    Device types:";
2221     for (size_t t = 0; t < deviceTypes.size(); t++)
2222         msg << ' ' << deviceTypes[t];
2223
2224     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2225
2226     CV_LOG_ERROR(NULL, msg.str());
2227     return NULL;
2228 }
2229 #endif
2230
2231 #ifdef HAVE_OPENCL_SVM
2232 namespace svm {
2233
2234 enum AllocatorFlags { // don't use first 16 bits
2235         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2236         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2237         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2238         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2239         OPENCL_SVM_BUFFER_MAP = 4 << 16
2240 };
2241
2242 static bool checkForceSVMUmatUsage()
2243 {
2244     static bool initialized = false;
2245     static bool force = false;
2246     if (!initialized)
2247     {
2248         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2249         initialized = true;
2250     }
2251     return force;
2252 }
2253 static bool checkDisableSVMUMatUsage()
2254 {
2255     static bool initialized = false;
2256     static bool force = false;
2257     if (!initialized)
2258     {
2259         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2260         initialized = true;
2261     }
2262     return force;
2263 }
2264 static bool checkDisableSVM()
2265 {
2266     static bool initialized = false;
2267     static bool force = false;
2268     if (!initialized)
2269     {
2270         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2271         initialized = true;
2272     }
2273     return force;
2274 }
2275 // see SVMCapabilities
2276 static unsigned int getSVMCapabilitiesMask()
2277 {
2278     static bool initialized = false;
2279     static unsigned int mask = 0;
2280     if (!initialized)
2281     {
2282         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2283         if (envValue == NULL)
2284         {
2285             return ~0U; // all bits 1
2286         }
2287         mask = atoi(envValue);
2288         initialized = true;
2289     }
2290     return mask;
2291 }
2292 } // namespace
2293 #endif
2294
2295 static size_t getProgramCountLimit()
2296 {
2297     static bool initialized = false;
2298     static size_t count = 0;
2299     if (!initialized)
2300     {
2301         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2302         initialized = true;
2303     }
2304     return count;
2305 }
2306
2307 static int g_contextId = 0;
2308
2309 class OpenCLBufferPoolImpl;
2310 class OpenCLSVMBufferPoolImpl;
2311
2312 struct Context::Impl
2313 {
2314     static Context::Impl* get(Context& context) { return context.p; }
2315
2316     typedef std::deque<Context::Impl*> container_t;
2317     static container_t& getGlobalContainer()
2318     {
2319         static container_t g_contexts;
2320         return g_contexts;
2321     }
2322
2323 protected:
2324     Impl(const std::string& configuration_)
2325         : refcount(1)
2326         , contextId(CV_XADD(&g_contextId, 1))
2327         , configuration(configuration_)
2328         , handle(0)
2329 #ifdef HAVE_OPENCL_SVM
2330         , svmInitialized(false)
2331 #endif
2332     {
2333         if (!haveOpenCL())
2334             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2335
2336         cv::AutoLock lock(cv::getInitializationMutex());
2337         auto& container = getGlobalContainer();
2338         container.resize(std::max(container.size(), (size_t)contextId + 1));
2339         container[contextId] = this;
2340     }
2341
2342     ~Impl()
2343     {
2344 #ifdef _WIN32
2345         if (!cv::__termination)
2346 #endif
2347         {
2348             if (handle)
2349             {
2350                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2351                 handle = NULL;
2352             }
2353             devices.clear();
2354         }
2355
2356         {
2357             cv::AutoLock lock(cv::getInitializationMutex());
2358             auto& container = getGlobalContainer();
2359             CV_Assert((size_t)contextId < container.size());
2360             container[contextId] = NULL;
2361         }
2362     }
2363
2364     void init_device_list()
2365     {
2366         CV_Assert(handle);
2367
2368         cl_uint ndevices = 0;
2369         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2370         CV_Assert(ndevices > 0);
2371
2372         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2373         size_t devices_ret_size = 0;
2374         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2375         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2376
2377         devices.clear();
2378         for (unsigned i = 0; i < ndevices; i++)
2379         {
2380             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2381         }
2382     }
2383
2384     void __init_buffer_pools();  // w/o synchronization
2385     void _init_buffer_pools() const
2386     {
2387         if (!bufferPool_)
2388         {
2389             cv::AutoLock lock(cv::getInitializationMutex());
2390             if (!bufferPool_)
2391             {
2392                 const_cast<Impl*>(this)->__init_buffer_pools();
2393             }
2394         }
2395     }
2396 public:
2397     static Impl* findContext(const std::string& configuration)
2398     {
2399         CV_TRACE_FUNCTION();
2400         cv::AutoLock lock(cv::getInitializationMutex());
2401         auto& container = getGlobalContainer();
2402         if (configuration.empty() && !container.empty())
2403             return container[0];
2404         for (auto it = container.begin(); it != container.end(); ++it)
2405         {
2406             Impl* i = *it;
2407             if (i && i->configuration == configuration)
2408             {
2409                 return i;
2410             }
2411         }
2412         return NULL;
2413     }
2414
2415     static Impl* findOrCreateContext(const std::string& configuration_)
2416     {
2417         CV_TRACE_FUNCTION();
2418         std::string configuration = configuration_;
2419         if (configuration_.empty())
2420         {
2421             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2422             if (c)
2423                 configuration = c;
2424         }
2425         Impl* impl = findContext(configuration);
2426         if (impl)
2427         {
2428             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2429             return impl;
2430         }
2431
2432         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2433         if (d == NULL)
2434             return NULL;
2435
2436         impl = new Impl(configuration);
2437         try
2438         {
2439             impl->createFromDevice(d);
2440             if (impl->handle)
2441                 return impl;
2442             delete impl;
2443             return NULL;
2444         }
2445         catch (...)
2446         {
2447             delete impl;
2448             throw;
2449         }
2450     }
2451
2452     static Impl* findOrCreateContext(cl_context h)
2453     {
2454         CV_TRACE_FUNCTION();
2455
2456         CV_Assert(h);
2457
2458         std::string configuration = cv::format("@ctx-%p", (void*)h);
2459         Impl* impl = findContext(configuration);
2460         if (impl)
2461         {
2462             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2463             impl->addref();
2464             return impl;
2465         }
2466
2467         impl = new Impl(configuration);
2468         try
2469         {
2470             CV_OCL_CHECK(clRetainContext(h));
2471             impl->handle = h;
2472             impl->init_device_list();
2473             return impl;
2474         }
2475         catch (...)
2476         {
2477             delete impl;
2478             throw;
2479         }
2480     }
2481
2482     static Impl* findOrCreateContext(const ocl::Device& device)
2483     {
2484         CV_TRACE_FUNCTION();
2485
2486         CV_Assert(!device.empty());
2487         cl_device_id d = (cl_device_id)device.ptr();
2488         CV_Assert(d);
2489
2490         std::string configuration = cv::format("@dev-%p", (void*)d);
2491         Impl* impl = findContext(configuration);
2492         if (impl)
2493         {
2494             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2495             impl->addref();
2496             return impl;
2497         }
2498
2499         impl = new Impl(configuration);
2500         try
2501         {
2502             impl->createFromDevice(d);
2503             CV_Assert(impl->handle);
2504             return impl;
2505         }
2506         catch (...)
2507         {
2508             delete impl;
2509             throw;
2510         }
2511     }
2512
2513     void setDefault()
2514     {
2515         CV_TRACE_FUNCTION();
2516         cl_device_id d = selectOpenCLDevice();
2517
2518         if (d == NULL)
2519             return;
2520
2521         createFromDevice(d);
2522     }
2523
2524     void createFromDevice(cl_device_id d)
2525     {
2526         CV_TRACE_FUNCTION();
2527         CV_Assert(handle == NULL);
2528
2529         cl_platform_id pl = NULL;
2530         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2531
2532         cl_context_properties prop[] =
2533         {
2534             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2535             0
2536         };
2537
2538         // !!! in the current implementation force the number of devices to 1 !!!
2539         cl_uint nd = 1;
2540         cl_int status;
2541
2542         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2543         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2544
2545         bool ok = handle != 0 && status == CL_SUCCESS;
2546         if( ok )
2547         {
2548             devices.resize(nd);
2549             devices[0].set(d);
2550         }
2551         else
2552             handle = NULL;
2553     }
2554
2555     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2556
2557     void unloadProg(Program& prog)
2558     {
2559         cv::AutoLock lock(program_cache_mutex);
2560         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2561         {
2562               phash_t::iterator it = phash.find(*i);
2563               if (it != phash.end())
2564               {
2565                   if (it->second.ptr() == prog.ptr())
2566                   {
2567                       phash.erase(*i);
2568                       cacheList.erase(i);
2569                       return;
2570                   }
2571               }
2572         }
2573     }
2574
2575     std::string& getPrefixString()
2576     {
2577         if (prefix.empty())
2578         {
2579             cv::AutoLock lock(program_cache_mutex);
2580             if (prefix.empty())
2581             {
2582                 CV_Assert(!devices.empty());
2583                 const Device& d = devices[0];
2584                 int bits = d.addressBits();
2585                 if (bits > 0 && bits != 64)
2586                     prefix = cv::format("%d-bit--", bits);
2587                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2588                 // sanitize chars
2589                 for (size_t i = 0; i < prefix.size(); i++)
2590                 {
2591                     char c = prefix[i];
2592                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2593                     {
2594                         prefix[i] = '_';
2595                     }
2596                 }
2597             }
2598         }
2599         return prefix;
2600     }
2601
2602     std::string& getPrefixBase()
2603     {
2604         if (prefix_base.empty())
2605         {
2606             cv::AutoLock lock(program_cache_mutex);
2607             if (prefix_base.empty())
2608             {
2609                 const Device& d = devices[0];
2610                 int bits = d.addressBits();
2611                 if (bits > 0 && bits != 64)
2612                     prefix_base = cv::format("%d-bit--", bits);
2613                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2614                 // sanitize chars
2615                 for (size_t i = 0; i < prefix_base.size(); i++)
2616                 {
2617                     char c = prefix_base[i];
2618                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2619                     {
2620                         prefix_base[i] = '_';
2621                     }
2622                 }
2623             }
2624         }
2625         return prefix_base;
2626     }
2627
2628     IMPLEMENT_REFCOUNTABLE();
2629
2630     const int contextId;  // global unique ID
2631     const std::string configuration;
2632
2633     cl_context handle;
2634     std::vector<Device> devices;
2635
2636     std::string prefix;
2637     std::string prefix_base;
2638
2639     cv::Mutex program_cache_mutex;
2640     typedef std::map<std::string, Program> phash_t;
2641     phash_t phash;
2642     typedef std::list<cv::String> CacheList;
2643     CacheList cacheList;
2644
2645     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2646     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2647     OpenCLBufferPoolImpl& getBufferPool() const
2648     {
2649         _init_buffer_pools();
2650         CV_DbgAssert(bufferPool_);
2651         return *bufferPool_.get();
2652     }
2653     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2654     {
2655         _init_buffer_pools();
2656         CV_DbgAssert(bufferPoolHostPtr_);
2657         return *bufferPoolHostPtr_.get();
2658     }
2659
2660 #ifdef HAVE_OPENCL_SVM
2661     bool svmInitialized;
2662     bool svmAvailable;
2663     bool svmEnabled;
2664     svm::SVMCapabilities svmCapabilities;
2665     svm::SVMFunctions svmFunctions;
2666
2667     void svmInit()
2668     {
2669         CV_Assert(handle != NULL);
2670         const Device& device = devices[0];
2671         cl_device_svm_capabilities deviceCaps = 0;
2672         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2673         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2674         if (status != CL_SUCCESS)
2675         {
2676             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2677             goto noSVM;
2678         }
2679         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2680         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2681         svmCapabilities.value_ =
2682                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2683                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2684                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2685                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2686         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2687         if (svmCapabilities.value_ == 0)
2688         {
2689             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2690             goto noSVM;
2691         }
2692         try
2693         {
2694             // Try OpenCL 2.0
2695             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2696             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2697             if (!ptr)
2698             {
2699                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2700                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2701             }
2702             try
2703             {
2704                 bool error = false;
2705                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2706                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2707                 {
2708                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2709                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2710                 }
2711                 clFinish(q);
2712                 try
2713                 {
2714                     ((int*)ptr)[0] = 100;
2715                 }
2716                 catch (...)
2717                 {
2718                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2719                     error = true;
2720                 }
2721                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2722                 {
2723                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2724                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2725                 }
2726                 clFinish(q);
2727                 if (error)
2728                 {
2729                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2730                 }
2731             }
2732             catch (...)
2733             {
2734                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2735                 clSVMFree(handle, ptr);
2736                 throw;
2737             }
2738             clSVMFree(handle, ptr);
2739             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2740             svmFunctions.fn_clSVMFree = clSVMFree;
2741             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2742             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2743             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2744             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2745             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2746             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2747             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2748         }
2749         catch (...)
2750         {
2751             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2752             try
2753             {
2754                 // Try HSA extension
2755                 String extensions = device.extensions();
2756                 if (extensions.find("cl_amd_svm") == String::npos)
2757                 {
2758                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2759                     goto noSVM;
2760                 }
2761                 cl_platform_id p = NULL;
2762                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2763                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2764                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2765                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2766                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2767                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2768                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2769                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2770                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2771                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2772                 CV_Assert(svmFunctions.isValid());
2773             }
2774             catch (...)
2775             {
2776                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2777                 goto noSVM;
2778             }
2779         }
2780
2781         svmAvailable = true;
2782         svmEnabled = !svm::checkDisableSVM();
2783         svmInitialized = true;
2784         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2785         return;
2786     noSVM:
2787         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2788         svmAvailable = false;
2789         svmEnabled = false;
2790         svmCapabilities.value_ = 0;
2791         svmInitialized = true;
2792         svmFunctions.fn_clSVMAlloc = NULL;
2793         return;
2794     }
2795
2796     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2797
2798     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2799     {
2800         _init_buffer_pools();
2801         CV_DbgAssert(bufferPoolSVM_);
2802         return *bufferPoolSVM_.get();
2803     }
2804 #endif
2805
2806     friend class Program;
2807 };
2808
2809
2810 Context::Context()
2811 {
2812     p = 0;
2813 }
2814
2815 Context::~Context()
2816 {
2817     release();
2818 }
2819
2820 // deprecated
2821 Context::Context(int dtype)
2822 {
2823     p = 0;
2824     create(dtype);
2825 }
2826
2827 void Context::release()
2828 {
2829     if (p)
2830     {
2831         p->release();
2832         p = NULL;
2833     }
2834 }
2835
2836 bool Context::create()
2837 {
2838     release();
2839     if (!haveOpenCL())
2840         return false;
2841     p = Impl::findOrCreateContext(std::string());
2842     if (p->handle)
2843         return true;
2844     release();
2845     return false;
2846 }
2847
2848 // deprecated
2849 bool Context::create(int dtype)
2850 {
2851     if( !haveOpenCL() )
2852         return false;
2853     release();
2854     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2855     {
2856         p = Impl::findOrCreateContext("");
2857     }
2858     else if (dtype == CL_DEVICE_TYPE_GPU)
2859     {
2860         p = Impl::findOrCreateContext(":GPU:");
2861     }
2862     else if (dtype == CL_DEVICE_TYPE_CPU)
2863     {
2864         p = Impl::findOrCreateContext(":CPU:");
2865     }
2866     else
2867     {
2868         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2869     }
2870     if (p && !p->handle)
2871     {
2872         release();
2873     }
2874     return p != 0;
2875 }
2876
2877 Context::Context(const Context& c)
2878 {
2879     p = (Impl*)c.p;
2880     if(p)
2881         p->addref();
2882 }
2883
2884 Context& Context::operator = (const Context& c)
2885 {
2886     Impl* newp = (Impl*)c.p;
2887     if(newp)
2888         newp->addref();
2889     if(p)
2890         p->release();
2891     p = newp;
2892     return *this;
2893 }
2894
2895 void* Context::ptr() const
2896 {
2897     return p == NULL ? NULL : p->handle;
2898 }
2899
2900 size_t Context::ndevices() const
2901 {
2902     return p ? p->devices.size() : 0;
2903 }
2904
2905 Device& Context::device(size_t idx) const
2906 {
2907     static Device dummy;
2908     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2909 }
2910
2911 Context& Context::getDefault(bool initialize)
2912 {
2913     auto& c = OpenCLExecutionContext::getCurrent();
2914     if (!c.empty())
2915     {
2916         auto& ctx = c.getContext();
2917         return ctx;
2918     }
2919
2920     CV_UNUSED(initialize);
2921     static Context dummy;
2922     return dummy;
2923 }
2924
2925 Program Context::getProg(const ProgramSource& prog,
2926                          const String& buildopts, String& errmsg)
2927 {
2928     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2929 }
2930
2931 void Context::unloadProg(Program& prog)
2932 {
2933     if (p)
2934         p->unloadProg(prog);
2935 }
2936
2937 /* static */
2938 Context Context::fromHandle(void* context)
2939 {
2940     Context ctx;
2941     ctx.p = Impl::findOrCreateContext((cl_context)context);
2942     return ctx;
2943 }
2944
2945 /* static */
2946 Context Context::fromDevice(const ocl::Device& device)
2947 {
2948     Context ctx;
2949     ctx.p = Impl::findOrCreateContext(device);
2950     return ctx;
2951 }
2952
2953 /* static */
2954 Context Context::create(const std::string& configuration)
2955 {
2956     Context ctx;
2957     ctx.p = Impl::findOrCreateContext(configuration);
2958     return ctx;
2959 }
2960
2961 #ifdef HAVE_OPENCL_SVM
2962 bool Context::useSVM() const
2963 {
2964     Context::Impl* i = p;
2965     CV_Assert(i);
2966     if (!i->svmInitialized)
2967         i->svmInit();
2968     return i->svmEnabled;
2969 }
2970 void Context::setUseSVM(bool enabled)
2971 {
2972     Context::Impl* i = p;
2973     CV_Assert(i);
2974     if (!i->svmInitialized)
2975         i->svmInit();
2976     if (enabled && !i->svmAvailable)
2977     {
2978         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2979     }
2980     i->svmEnabled = enabled;
2981 }
2982 #else
2983 bool Context::useSVM() const { return false; }
2984 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2985 #endif
2986
2987 #ifdef HAVE_OPENCL_SVM
2988 namespace svm {
2989
2990 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2991 {
2992     Context::Impl* i = context.p;
2993     CV_Assert(i);
2994     if (!i->svmInitialized)
2995         i->svmInit();
2996     return i->svmCapabilities;
2997 }
2998
2999 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3000 {
3001     Context::Impl* i = context.p;
3002     CV_Assert(i);
3003     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3004     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3005     return &i->svmFunctions;
3006 }
3007
3008 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3009 {
3010     if (checkForceSVMUmatUsage())
3011         return true;
3012     if (checkDisableSVMUMatUsage())
3013         return false;
3014     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3015         return true;
3016     return false; // don't use SVM by default
3017 }
3018
3019 } // namespace cv::ocl::svm
3020 #endif // HAVE_OPENCL_SVM
3021
3022
3023 static void get_platform_name(cl_platform_id id, String& name)
3024 {
3025     // get platform name string length
3026     size_t sz = 0;
3027     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3028
3029     // get platform name string
3030     AutoBuffer<char> buf(sz + 1);
3031     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3032
3033     // just in case, ensure trailing zero for ASCIIZ string
3034     buf[sz] = 0;
3035
3036     name = buf.data();
3037 }
3038
3039 /*
3040 // Attaches OpenCL context to OpenCV
3041 */
3042 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3043 {
3044     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3045     ctx.bind();
3046 }
3047
3048 /* static */
3049 OpenCLExecutionContext OpenCLExecutionContext::create(
3050         const std::string& platformName, void* platformID, void* context, void* deviceID
3051 )
3052 {
3053     if (!haveOpenCL())
3054         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3055
3056     cl_uint cnt = 0;
3057     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3058
3059     if (cnt == 0)
3060         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3061
3062     std::vector<cl_platform_id> platforms(cnt);
3063
3064     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3065
3066     bool platformAvailable = false;
3067
3068     // check if external platformName contained in list of available platforms in OpenCV
3069     for (unsigned int i = 0; i < cnt; i++)
3070     {
3071         String availablePlatformName;
3072         get_platform_name(platforms[i], availablePlatformName);
3073         // external platform is found in the list of available platforms
3074         if (platformName == availablePlatformName)
3075         {
3076             platformAvailable = true;
3077             break;
3078         }
3079     }
3080
3081     if (!platformAvailable)
3082         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3083
3084     // check if platformID corresponds to platformName
3085     String actualPlatformName;
3086     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3087     if (platformName != actualPlatformName)
3088         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3089
3090     OpenCLExecutionContext ctx;
3091     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3092     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3093     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3094     return ctx;
3095 }
3096
3097 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3098 {
3099     // internal call, less checks
3100     cl_platform_id platformID = (cl_platform_id)_platform;
3101     cl_context context = (cl_context)_context;
3102     cl_device_id deviceID = (cl_device_id)_device;
3103
3104     std::string platformName = PlatformInfo(platformID).name();
3105
3106     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3107     CV_Assert(!clExecCtx.empty());
3108     ctx = clExecCtx.getContext();
3109 }
3110
3111 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3112
3113 struct Queue::Impl
3114 {
3115     inline void __init()
3116     {
3117         refcount = 1;
3118         handle = 0;
3119         isProfilingQueue_ = false;
3120     }
3121
3122     Impl(cl_command_queue q)
3123     {
3124         __init();
3125         handle = q;
3126
3127         cl_command_queue_properties props = 0;
3128         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3129         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3130     }
3131
3132     Impl(cl_command_queue q, bool isProfilingQueue)
3133     {
3134         __init();
3135         handle = q;
3136         isProfilingQueue_ = isProfilingQueue;
3137     }
3138
3139     Impl(const Context& c, const Device& d, bool withProfiling = false)
3140     {
3141         __init();
3142
3143         const Context* pc = &c;
3144         cl_context ch = (cl_context)pc->ptr();
3145         if( !ch )
3146         {
3147             pc = &Context::getDefault();
3148             ch = (cl_context)pc->ptr();
3149         }
3150         cl_device_id dh = (cl_device_id)d.ptr();
3151         if( !dh )
3152             dh = (cl_device_id)pc->device(0).ptr();
3153         cl_int retval = 0;
3154         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3155         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3156         isProfilingQueue_ = withProfiling;
3157     }
3158
3159     ~Impl()
3160     {
3161 #ifdef _WIN32
3162         if (!cv::__termination)
3163 #endif
3164         {
3165             if(handle)
3166             {
3167                 CV_OCL_DBG_CHECK(clFinish(handle));
3168                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3169                 handle = NULL;
3170             }
3171         }
3172     }
3173
3174     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3175     {
3176         if (isProfilingQueue_)
3177             return self;
3178
3179         if (profiling_queue_.ptr())
3180             return profiling_queue_;
3181
3182         cl_context ctx = 0;
3183         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3184
3185         cl_device_id device = 0;
3186         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3187
3188         cl_int result = CL_SUCCESS;
3189         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3190         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3191         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3192
3193         Queue queue;
3194         queue.p = new Impl(q, true);
3195         profiling_queue_ = queue;
3196
3197         return profiling_queue_;
3198     }
3199
3200     IMPLEMENT_REFCOUNTABLE();
3201
3202     cl_command_queue handle;
3203     bool isProfilingQueue_;
3204     cv::ocl::Queue profiling_queue_;
3205 };
3206
3207 Queue::Queue()
3208 {
3209     p = 0;
3210 }
3211
3212 Queue::Queue(const Context& c, const Device& d)
3213 {
3214     p = 0;
3215     create(c, d);
3216 }
3217
3218 Queue::Queue(const Queue& q)
3219 {
3220     p = q.p;
3221     if(p)
3222         p->addref();
3223 }
3224
3225 Queue& Queue::operator = (const Queue& q)
3226 {
3227     Impl* newp = (Impl*)q.p;
3228     if(newp)
3229         newp->addref();
3230     if(p)
3231         p->release();
3232     p = newp;
3233     return *this;
3234 }
3235
3236 Queue::~Queue()
3237 {
3238     if(p)
3239         p->release();
3240 }
3241
3242 bool Queue::create(const Context& c, const Device& d)
3243 {
3244     if(p)
3245         p->release();
3246     p = new Impl(c, d);
3247     return p->handle != 0;
3248 }
3249
3250 void Queue::finish()
3251 {
3252     if(p && p->handle)
3253     {
3254         CV_OCL_DBG_CHECK(clFinish(p->handle));
3255     }
3256 }
3257
3258 const Queue& Queue::getProfilingQueue() const
3259 {
3260     CV_Assert(p);
3261     return p->getProfilingQueue(*this);
3262 }
3263
3264 void* Queue::ptr() const
3265 {
3266     return p ? p->handle : 0;
3267 }
3268
3269 Queue& Queue::getDefault()
3270 {
3271     auto& c = OpenCLExecutionContext::getCurrent();
3272     if (!c.empty())
3273     {
3274         auto& q = c.getQueue();
3275         return q;
3276     }
3277     static Queue dummy;
3278     return dummy;
3279 }
3280
3281 static cl_command_queue getQueue(const Queue& q)
3282 {
3283     cl_command_queue qq = (cl_command_queue)q.ptr();
3284     if(!qq)
3285         qq = (cl_command_queue)Queue::getDefault().ptr();
3286     return qq;
3287 }
3288
3289 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3290
3291 KernelArg::KernelArg()
3292     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3293 {
3294 }
3295
3296 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3297     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3298 {
3299     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3300 }
3301
3302 KernelArg KernelArg::Constant(const Mat& m)
3303 {
3304     CV_Assert(m.isContinuous());
3305     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3306 }
3307
3308 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3309
3310 struct Kernel::Impl
3311 {
3312     Impl(const char* kname, const Program& prog) :
3313         refcount(1), handle(NULL), isInProgress(false), nu(0)
3314     {
3315         cl_program ph = (cl_program)prog.ptr();
3316         cl_int retval = 0;
3317         name = kname;
3318         if (ph)
3319         {
3320             handle = clCreateKernel(ph, kname, &retval);
3321             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3322         }
3323         for( int i = 0; i < MAX_ARRS; i++ )
3324             u[i] = 0;
3325         haveTempDstUMats = false;
3326         haveTempSrcUMats = false;
3327     }
3328
3329     void cleanupUMats()
3330     {
3331         for( int i = 0; i < MAX_ARRS; i++ )
3332             if( u[i] )
3333             {
3334                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3335                 {
3336                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3337                     u[i]->currAllocator->deallocate(u[i]);
3338                 }
3339                 u[i] = 0;
3340             }
3341         nu = 0;
3342         haveTempDstUMats = false;
3343         haveTempSrcUMats = false;
3344     }
3345
3346     void addUMat(const UMat& m, bool dst)
3347     {
3348         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3349         u[nu] = m.u;
3350         CV_XADD(&m.u->urefcount, 1);
3351         nu++;
3352         if(dst && m.u->tempUMat())
3353             haveTempDstUMats = true;
3354         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3355             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3356     }
3357
3358     void addImage(const Image2D& image)
3359     {
3360         images.push_back(image);
3361     }
3362
3363     void finit(cl_event e)
3364     {
3365         CV_UNUSED(e);
3366         cleanupUMats();
3367         images.clear();
3368         isInProgress = false;
3369         release();
3370     }
3371
3372     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3373             bool sync, int64* timeNS, const Queue& q);
3374
3375     ~Impl()
3376     {
3377         if(handle)
3378         {
3379             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3380         }
3381     }
3382
3383     IMPLEMENT_REFCOUNTABLE();
3384
3385     cv::String name;
3386     cl_kernel handle;
3387     enum { MAX_ARRS = 16 };
3388     UMatData* u[MAX_ARRS];
3389     bool isInProgress;
3390     int nu;
3391     std::list<Image2D> images;
3392     bool haveTempDstUMats;
3393     bool haveTempSrcUMats;
3394 };
3395
3396 }} // namespace cv::ocl
3397
3398 extern "C" {
3399
3400 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3401 {
3402     try
3403     {
3404         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3405     }
3406     catch (const cv::Exception& exc)
3407     {
3408         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3409     }
3410     catch (const std::exception& exc)
3411     {
3412         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3413     }
3414     catch (...)
3415     {
3416         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3417     }
3418 }
3419
3420 }
3421
3422 namespace cv { namespace ocl {
3423
3424 Kernel::Kernel()
3425 {
3426     p = 0;
3427 }
3428
3429 Kernel::Kernel(const char* kname, const Program& prog)
3430 {
3431     p = 0;
3432     create(kname, prog);
3433 }
3434
3435 Kernel::Kernel(const char* kname, const ProgramSource& src,
3436                const String& buildopts, String* errmsg)
3437 {
3438     p = 0;
3439     create(kname, src, buildopts, errmsg);
3440 }
3441
3442 Kernel::Kernel(const Kernel& k)
3443 {
3444     p = k.p;
3445     if(p)
3446         p->addref();
3447 }
3448
3449 Kernel& Kernel::operator = (const Kernel& k)
3450 {
3451     Impl* newp = (Impl*)k.p;
3452     if(newp)
3453         newp->addref();
3454     if(p)
3455         p->release();
3456     p = newp;
3457     return *this;
3458 }
3459
3460 Kernel::~Kernel()
3461 {
3462     if(p)
3463         p->release();
3464 }
3465
3466 bool Kernel::create(const char* kname, const Program& prog)
3467 {
3468     if(p)
3469         p->release();
3470     p = new Impl(kname, prog);
3471     if(p->handle == 0)
3472     {
3473         p->release();
3474         p = 0;
3475     }
3476 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3477     CV_Assert(p);
3478 #endif
3479     return p != 0;
3480 }
3481
3482 bool Kernel::create(const char* kname, const ProgramSource& src,
3483                     const String& buildopts, String* errmsg)
3484 {
3485     if(p)
3486     {
3487         p->release();
3488         p = 0;
3489     }
3490     String tempmsg;
3491     if( !errmsg ) errmsg = &tempmsg;
3492     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3493     return create(kname, prog);
3494 }
3495
3496 void* Kernel::ptr() const
3497 {
3498     return p ? p->handle : 0;
3499 }
3500
3501 bool Kernel::empty() const
3502 {
3503     return ptr() == 0;
3504 }
3505
3506 int Kernel::set(int i, const void* value, size_t sz)
3507 {
3508     if (!p || !p->handle)
3509         return -1;
3510     if (i < 0)
3511         return i;
3512     if( i == 0 )
3513         p->cleanupUMats();
3514
3515     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3516     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3517     if (retval != CL_SUCCESS)
3518         return -1;
3519     return i+1;
3520 }
3521
3522 int Kernel::set(int i, const Image2D& image2D)
3523 {
3524     p->addImage(image2D);
3525     cl_mem h = (cl_mem)image2D.ptr();
3526     return set(i, &h, sizeof(h));
3527 }
3528
3529 int Kernel::set(int i, const UMat& m)
3530 {
3531     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3532 }
3533
3534 int Kernel::set(int i, const KernelArg& arg)
3535 {
3536     if( !p || !p->handle )
3537         return -1;
3538     if (i < 0)
3539     {
3540         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3541                 p->name.c_str(), (int)i));
3542         return i;
3543     }
3544     if( i == 0 )
3545         p->cleanupUMats();
3546     cl_int status = 0;
3547     if( arg.m )
3548     {
3549         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3550                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3551         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3552         if (ptronly && arg.m->empty())
3553         {
3554             cl_mem h_null = (cl_mem)NULL;
3555             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3556             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3557             return i + 1;
3558         }
3559         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3560
3561         if (!h)
3562         {
3563             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3564                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3565             p->release();
3566             p = 0;
3567             return -1;
3568         }
3569
3570 #ifdef HAVE_OPENCL_SVM
3571         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3572         {
3573             const Context& ctx = Context::getDefault();
3574             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3575             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3576             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3577 #if 1 // TODO
3578             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3579 #else
3580             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3581 #endif
3582             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3583         }
3584         else
3585 #endif
3586         {
3587             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3588             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3589         }
3590
3591         if (ptronly)
3592         {
3593             i++;
3594         }
3595         else if( arg.m->dims <= 2 )
3596         {
3597             UMat2D u2d(*arg.m);
3598             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3599             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3600             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3601             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3602             i += 3;
3603
3604             if( !(arg.flags & KernelArg::NO_SIZE) )
3605             {
3606                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3607                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3608                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3609                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3610                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3611                 i += 2;
3612             }
3613         }
3614         else
3615         {
3616             UMat3D u3d(*arg.m);
3617             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3618             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3619             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3620             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3621             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3622             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3623             i += 4;
3624             if( !(arg.flags & KernelArg::NO_SIZE) )
3625             {
3626                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3627                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3628                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3629                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3630                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3631                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3632                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3633                 i += 3;
3634             }
3635         }
3636         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3637         return i;
3638     }
3639     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3640     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3641     return i+1;
3642 }
3643
3644 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3645                  bool sync, const Queue& q)
3646 {
3647     if (!p)
3648         return false;
3649
3650     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3651     size_t total = 1;
3652     CV_Assert(_globalsize != NULL);
3653     for (int i = 0; i < dims; i++)
3654     {
3655         size_t val = _localsize ? _localsize[i] :
3656             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3657         CV_Assert( val > 0 );
3658         total *= _globalsize[i];
3659         if (_globalsize[i] == 1 && !_localsize)
3660             val = 1;
3661         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3662     }
3663     CV_Assert(total > 0);
3664
3665     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3666 }
3667
3668
3669 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3670         bool sync, int64* timeNS, const Queue& q)
3671 {
3672     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3673
3674     if (!handle || isInProgress)
3675         return false;
3676
3677     cl_command_queue qq = getQueue(q);
3678     if (haveTempDstUMats)
3679         sync = true;
3680     if (haveTempSrcUMats)
3681         sync = true;
3682     if (timeNS)
3683         sync = true;
3684     cl_event asyncEvent = 0;
3685     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3686                                            NULL, globalsize, localsize, 0, 0,
3687                                            (sync && !timeNS) ? 0 : &asyncEvent);
3688 #if !CV_OPENCL_SHOW_RUN_KERNELS
3689     if (retval != CL_SUCCESS)
3690 #endif
3691     {
3692         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3693                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3694                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3695                         sync ? "true" : "false"
3696                         );
3697         if (retval != CL_SUCCESS)
3698         {
3699             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3700         }
3701 #if CV_OPENCL_TRACE_CHECK
3702         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3703 #else
3704         printf("%s\n", msg.c_str());
3705         fflush(stdout);
3706 #endif
3707     }
3708     if (sync || retval != CL_SUCCESS)
3709     {
3710         CV_OCL_DBG_CHECK(clFinish(qq));
3711         if (timeNS)
3712         {
3713             if (retval == CL_SUCCESS)
3714             {
3715                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3716                 cl_ulong startTime, stopTime;
3717                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3718                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3719                 *timeNS = (int64)(stopTime - startTime);
3720             }
3721             else
3722             {
3723                 *timeNS = -1;
3724             }
3725         }
3726         cleanupUMats();
3727     }
3728     else
3729     {
3730         addref();
3731         isInProgress = true;
3732         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3733     }
3734     if (asyncEvent)
3735         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3736     return retval == CL_SUCCESS;
3737 }
3738
3739 bool Kernel::runTask(bool sync, const Queue& q)
3740 {
3741     if(!p || !p->handle || p->isInProgress)
3742         return false;
3743
3744     cl_command_queue qq = getQueue(q);
3745     cl_event asyncEvent = 0;
3746     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3747     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3748     if (sync || retval != CL_SUCCESS)
3749     {
3750         CV_OCL_DBG_CHECK(clFinish(qq));
3751         p->cleanupUMats();
3752     }
3753     else
3754     {
3755         p->addref();
3756         p->isInProgress = true;
3757         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3758     }
3759     if (asyncEvent)
3760         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3761     return retval == CL_SUCCESS;
3762 }
3763
3764 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3765 {
3766     CV_Assert(p && p->handle && !p->isInProgress);
3767     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3768     CV_Assert(q.ptr());
3769     q.finish(); // call clFinish() on base queue
3770     Queue profilingQueue = q.getProfilingQueue();
3771     int64 timeNs = -1;
3772     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3773     return res ? timeNs : -1;
3774 }
3775
3776 size_t Kernel::workGroupSize() const
3777 {
3778     if(!p || !p->handle)
3779         return 0;
3780     size_t val = 0, retsz = 0;
3781     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3782     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3783     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3784     return status == CL_SUCCESS ? val : 0;
3785 }
3786
3787 size_t Kernel::preferedWorkGroupSizeMultiple() const
3788 {
3789     if(!p || !p->handle)
3790         return 0;
3791     size_t val = 0, retsz = 0;
3792     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3793     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3794     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3795     return status == CL_SUCCESS ? val : 0;
3796 }
3797
3798 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3799 {
3800     if(!p || !p->handle || !wsz)
3801         return 0;
3802     size_t retsz = 0;
3803     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3804     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3805     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3806     return status == CL_SUCCESS;
3807 }
3808
3809 size_t Kernel::localMemSize() const
3810 {
3811     if(!p || !p->handle)
3812         return 0;
3813     size_t retsz = 0;
3814     cl_ulong val = 0;
3815     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3816     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3817     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3818     return status == CL_SUCCESS ? (size_t)val : 0;
3819 }
3820
3821
3822
3823 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3824
3825 struct ProgramSource::Impl
3826 {
3827     IMPLEMENT_REFCOUNTABLE();
3828
3829     enum KIND {
3830         PROGRAM_SOURCE_CODE = 0,
3831         PROGRAM_BINARIES,
3832         PROGRAM_SPIR,
3833         PROGRAM_SPIRV
3834     } kind_;
3835
3836     Impl(const String& src)
3837     {
3838         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3839         initFromSource(src, cv::String());
3840     }
3841     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3842     {
3843         init(PROGRAM_SOURCE_CODE, module, name);
3844         initFromSource(codeStr, codeHash);
3845     }
3846
3847     /// reset fields
3848     void init(enum KIND kind, const String& module, const String& name)
3849     {
3850         refcount = 1;
3851         kind_ = kind;
3852         module_ = module;
3853         name_ = name;
3854
3855         sourceAddr_ = NULL;
3856         sourceSize_ = 0;
3857         isHashUpdated = false;
3858     }
3859
3860     void initFromSource(const String& codeStr, const String& codeHash)
3861     {
3862         codeStr_ = codeStr;
3863         sourceHash_ = codeHash;
3864         if (sourceHash_.empty())
3865         {
3866             updateHash();
3867         }
3868         else
3869         {
3870             isHashUpdated = true;
3871         }
3872     }
3873
3874     void updateHash(const char* hashStr = NULL)
3875     {
3876         if (hashStr)
3877         {
3878             sourceHash_ = cv::String(hashStr);
3879             isHashUpdated = true;
3880             return;
3881         }
3882         uint64 hash = 0;
3883         switch (kind_)
3884         {
3885         case PROGRAM_SOURCE_CODE:
3886             if (sourceAddr_)
3887             {
3888                 CV_Assert(codeStr_.empty());
3889                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3890             }
3891             else
3892             {
3893                 CV_Assert(!codeStr_.empty());
3894                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3895             }
3896             break;
3897         case PROGRAM_BINARIES:
3898         case PROGRAM_SPIR:
3899         case PROGRAM_SPIRV:
3900             hash = crc64(sourceAddr_, sourceSize_);
3901             break;
3902         default:
3903             CV_Error(Error::StsInternal, "Internal error");
3904         }
3905         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3906         isHashUpdated = true;
3907     }
3908
3909     Impl(enum KIND kind,
3910             const String& module, const String& name,
3911             const unsigned char* binary, const size_t size,
3912             const cv::String& buildOptions = cv::String())
3913     {
3914         init(kind, module, name);
3915
3916         sourceAddr_ = binary;
3917         sourceSize_ = size;
3918
3919         buildOptions_ = buildOptions;
3920     }
3921
3922     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3923             const char* sourceCodeStaticStr, const char* hashStaticStr,
3924             const cv::String& buildOptions)
3925     {
3926         ProgramSource result;
3927         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3928                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3929         result.p->updateHash(hashStaticStr);
3930         return result;
3931     }
3932
3933     static ProgramSource fromBinary(const String& module, const String& name,
3934             const unsigned char* binary, const size_t size,
3935             const cv::String& buildOptions)
3936     {
3937         ProgramSource result;
3938         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3939         return result;
3940     }
3941
3942     static ProgramSource fromSPIR(const String& module, const String& name,
3943             const unsigned char* binary, const size_t size,
3944             const cv::String& buildOptions)
3945     {
3946         ProgramSource result;
3947         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3948         return result;
3949     }
3950
3951     String module_;
3952     String name_;
3953
3954     // TODO std::vector<ProgramSource> includes_;
3955     String codeStr_; // PROGRAM_SOURCE_CODE only
3956
3957     const unsigned char* sourceAddr_;
3958     size_t sourceSize_;
3959
3960     cv::String buildOptions_;
3961
3962     String sourceHash_;
3963     bool isHashUpdated;
3964
3965     friend struct Program::Impl;
3966     friend struct internal::ProgramEntry;
3967     friend struct Context::Impl;
3968 };
3969
3970
3971 ProgramSource::ProgramSource()
3972 {
3973     p = 0;
3974 }
3975
3976 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3977 {
3978     p = new Impl(module, name, codeStr, codeHash);
3979 }
3980
3981 ProgramSource::ProgramSource(const char* prog)
3982 {
3983     p = new Impl(prog);
3984 }
3985
3986 ProgramSource::ProgramSource(const String& prog)
3987 {
3988     p = new Impl(prog);
3989 }
3990
3991 ProgramSource::~ProgramSource()
3992 {
3993     if(p)
3994         p->release();
3995 }
3996
3997 ProgramSource::ProgramSource(const ProgramSource& prog)
3998 {
3999     p = prog.p;
4000     if(p)
4001         p->addref();
4002 }
4003
4004 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4005 {
4006     Impl* newp = (Impl*)prog.p;
4007     if(newp)
4008         newp->addref();
4009     if(p)
4010         p->release();
4011     p = newp;
4012     return *this;
4013 }
4014
4015 const String& ProgramSource::source() const
4016 {
4017     CV_Assert(p);
4018     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4019     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4020     return p->codeStr_;
4021 }
4022
4023 ProgramSource::hash_t ProgramSource::hash() const
4024 {
4025     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4026 }
4027
4028 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4029         const unsigned char* binary, const size_t size,
4030         const cv::String& buildOptions)
4031 {
4032     CV_Assert(binary);
4033     CV_Assert(size > 0);
4034     return Impl::fromBinary(module, name, binary, size, buildOptions);
4035 }
4036
4037 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4038         const unsigned char* binary, const size_t size,
4039         const cv::String& buildOptions)
4040 {
4041     CV_Assert(binary);
4042     CV_Assert(size > 0);
4043     return Impl::fromBinary(module, name, binary, size, buildOptions);
4044 }
4045
4046
4047 internal::ProgramEntry::operator ProgramSource&() const
4048 {
4049     if (this->pProgramSource == NULL)
4050     {
4051         cv::AutoLock lock(cv::getInitializationMutex());
4052         if (this->pProgramSource == NULL)
4053         {
4054             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4055             ProgramSource* ptr = new ProgramSource(ps);
4056             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4057         }
4058     }
4059     return *this->pProgramSource;
4060 }
4061
4062
4063
4064 /////////////////////////////////////////// Program /////////////////////////////////////////////
4065
4066 static
4067 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4068 {
4069     if (b.empty())
4070         return a;
4071     if (a.empty())
4072         return b;
4073     if (b[0] == ' ')
4074         return a + b;
4075     return a + (cv::String(" ") + b);
4076 }
4077
4078 struct Program::Impl
4079 {
4080     IMPLEMENT_REFCOUNTABLE();
4081
4082     Impl(const ProgramSource& src,
4083          const String& _buildflags, String& errmsg) :
4084          refcount(1),
4085          handle(NULL),
4086          buildflags(_buildflags)
4087     {
4088         const ProgramSource::Impl* src_ = src.getImpl();
4089         CV_Assert(src_);
4090         sourceModule_ = src_->module_;
4091         sourceName_ = src_->name_;
4092         const Context ctx = Context::getDefault();
4093         Device device = ctx.device(0);
4094         if (ctx.ptr() == NULL || device.ptr() == NULL)
4095             return;
4096         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4097         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4098         {
4099             if (device.isAMD())
4100                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4101             else if (device.isIntel())
4102                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4103             const String param_buildExtraOptions = getBuildExtraOptions();
4104             if (!param_buildExtraOptions.empty())
4105                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4106         }
4107         compile(ctx, src_, errmsg);
4108     }
4109
4110     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4111     {
4112         CV_Assert(ctx.getImpl());
4113         CV_Assert(src_);
4114
4115         // We don't cache OpenCL binaries
4116         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4117         {
4118             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4119             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4120             return isLoaded;
4121         }
4122         return compileWithCache(ctx, src_, errmsg);
4123     }
4124
4125     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4126     {
4127         CV_Assert(ctx.getImpl());
4128         CV_Assert(src_);
4129         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4130
4131 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4132         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4133         const std::string base_dir = config.prepareCacheDirectoryForContext(
4134                 ctx.getImpl()->getPrefixString(),
4135                 ctx.getImpl()->getPrefixBase()
4136         );
4137         const String& hash_str = src_->sourceHash_;
4138         cv::String fname;
4139         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4140         {
4141             CV_Assert(!hash_str.empty());
4142             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4143             fname = utils::fs::join(base_dir, fname);
4144         }
4145         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4146         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4147         {
4148             try
4149             {
4150                 std::vector<char> binaryBuf;
4151                 bool res = false;
4152                 {
4153                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4154                     BinaryProgramFile file(fname, hash_str.c_str());
4155                     res = file.read(buildflags, binaryBuf);
4156                 }
4157                 if (res)
4158                 {
4159                     CV_Assert(!binaryBuf.empty());
4160                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4161                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4162                     if (isLoaded)
4163                         return true;
4164                 }
4165             }
4166             catch (const cv::Exception& e)
4167             {
4168                 CV_UNUSED(e);
4169                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4170             }
4171             catch (...)
4172             {
4173                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4174             }
4175         }
4176 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4177         CV_Assert(handle == NULL);
4178         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4179         {
4180             if (!buildFromSources(ctx, src_, errmsg))
4181             {
4182                 return false;
4183             }
4184         }
4185         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4186         {
4187             buildflags = joinBuildOptions(buildflags, " -x spir");
4188             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4189             {
4190                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4191             }
4192             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4193             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4194             if (!isLoaded)
4195                 return false;
4196         }
4197         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4198         {
4199             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4200         }
4201         else
4202         {
4203             CV_Error(Error::StsInternal, "Internal error");
4204         }
4205         CV_Assert(handle != NULL);
4206 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4207         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4208         {
4209             try
4210             {
4211                 std::vector<char> binaryBuf;
4212                 getProgramBinary(binaryBuf);
4213                 {
4214                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4215                     BinaryProgramFile file(fname, hash_str.c_str());
4216                     file.write(buildflags, binaryBuf);
4217                 }
4218             }
4219             catch (const cv::Exception& e)
4220             {
4221                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4222             }
4223             catch (...)
4224             {
4225                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4226             }
4227         }
4228 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4229 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4230         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4231         {
4232             std::vector<char> binaryBuf;
4233             getProgramBinary(binaryBuf);
4234             if (!binaryBuf.empty())
4235             {
4236                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4237                 handle = NULL;
4238                 createFromBinary(ctx, binaryBuf, errmsg);
4239             }
4240         }
4241 #endif
4242         return handle != NULL;
4243     }
4244
4245     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4246     {
4247         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4248
4249         size_t retsz = 0;
4250         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4251                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4252         if (log_retval == CL_SUCCESS && retsz > 1)
4253         {
4254             buffer.resize(retsz + 16);
4255             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4256                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4257             if (log_retval == CL_SUCCESS)
4258             {
4259                 if (retsz < buffer.size())
4260                     buffer[retsz] = 0;
4261                 else
4262                     buffer[buffer.size() - 1] = 0;
4263             }
4264             else
4265             {
4266                 buffer[0] = 0;
4267             }
4268         }
4269
4270         errmsg = String(buffer.data());
4271         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4272                 sourceModule_.c_str(), sourceName_.c_str(),
4273                 result, getOpenCLErrorString(result),
4274                 buildflags.c_str(), errmsg.c_str());
4275         fflush(stdout);
4276     }
4277
4278     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4279     {
4280         CV_Assert(src_);
4281         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4282         CV_Assert(handle == NULL);
4283         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4284                 sourceModule_.c_str(), sourceName_.c_str(),
4285                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4286
4287         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4288
4289         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4290         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4291         CV_Assert(srcptr != NULL);
4292         CV_Assert(srclen > 0);
4293
4294         cl_int retval = 0;
4295
4296         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4297         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4298         CV_Assert(handle || retval != CL_SUCCESS);
4299         if (handle && retval == CL_SUCCESS)
4300         {
4301             size_t n = ctx.ndevices();
4302             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4303             cl_device_id* deviceList = deviceListBuf.data();
4304             for (size_t i = 0; i < n; i++)
4305             {
4306                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4307             }
4308
4309             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4310             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4311 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4312             if (retval != CL_SUCCESS)
4313 #endif
4314             {
4315                 dumpBuildLog_(retval, deviceList, errmsg);
4316
4317                 // don't remove "retval != CL_SUCCESS" condition here:
4318                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4319                 if (retval != CL_SUCCESS && handle)
4320                 {
4321                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4322                     handle = NULL;
4323                 }
4324             }
4325 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4326             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4327             {
4328                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4329                 size_t retsz = 0;
4330                 char kernels_buffer[4096] = {0};
4331                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4332                 if (retsz < sizeof(kernels_buffer))
4333                     kernels_buffer[retsz] = 0;
4334                 else
4335                     kernels_buffer[0] = 0;
4336                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4337             }
4338 #endif
4339
4340         }
4341         return handle != NULL;
4342     }
4343
4344     void getProgramBinary(std::vector<char>& buf)
4345     {
4346         CV_Assert(handle);
4347         size_t sz = 0;
4348         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4349         buf.resize(sz);
4350         uchar* ptr = (uchar*)&buf[0];
4351         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4352     }
4353
4354     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4355     {
4356         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4357     }
4358
4359     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4360     {
4361         CV_Assert(handle == NULL);
4362         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4363         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4364
4365         CV_Assert(binarySize > 0);
4366
4367         size_t ndevices = (int)ctx.ndevices();
4368         AutoBuffer<cl_device_id> devices_(ndevices);
4369         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4370         AutoBuffer<size_t> binarySizes_(ndevices);
4371
4372         cl_device_id* devices = devices_.data();
4373         const uchar** binaryPtrs = binaryPtrs_.data();
4374         size_t* binarySizes = binarySizes_.data();
4375         for (size_t i = 0; i < ndevices; i++)
4376         {
4377             devices[i] = (cl_device_id)ctx.device(i).ptr();
4378             binaryPtrs[i] = binaryAddr;
4379             binarySizes[i] = binarySize;
4380         }
4381
4382         cl_int result = 0;
4383         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4384                                            binarySizes, binaryPtrs, NULL, &result);
4385         if (result != CL_SUCCESS)
4386         {
4387             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4388             if (handle)
4389             {
4390                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4391                 handle = NULL;
4392             }
4393         }
4394         if (!handle)
4395         {
4396             return false;
4397         }
4398         // call clBuildProgram()
4399         {
4400             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4401             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4402             if (result != CL_SUCCESS)
4403             {
4404                 dumpBuildLog_(result, devices, errmsg);
4405                 if (handle)
4406                 {
4407                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4408                     handle = NULL;
4409                 }
4410                 return false;
4411             }
4412         }
4413         // check build status
4414         {
4415             cl_build_status build_status = CL_BUILD_NONE;
4416             size_t retsz = 0;
4417             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4418                     sizeof(build_status), &build_status, &retsz));
4419             if (result == CL_SUCCESS)
4420             {
4421                 if (build_status == CL_BUILD_SUCCESS)
4422                 {
4423                     return true;
4424                 }
4425                 else
4426                 {
4427                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4428                     return false;
4429                 }
4430             }
4431             else
4432             {
4433                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4434                 if (handle)
4435                 {
4436                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4437                     handle = NULL;
4438                 }
4439             }
4440         }
4441 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4442         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4443         {
4444             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4445             size_t retsz = 0;
4446             char kernels_buffer[4096] = {0};
4447             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4448             if (retsz < sizeof(kernels_buffer))
4449                 kernels_buffer[retsz] = 0;
4450             else
4451                 kernels_buffer[0] = 0;
4452             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4453         }
4454 #endif
4455         return handle != NULL;
4456     }
4457
4458     ~Impl()
4459     {
4460         if( handle )
4461         {
4462 #ifdef _WIN32
4463             if (!cv::__termination)
4464 #endif
4465             {
4466                 clReleaseProgram(handle);
4467             }
4468             handle = NULL;
4469         }
4470     }
4471
4472     cl_program handle;
4473
4474     String buildflags;
4475     String sourceModule_;
4476     String sourceName_;
4477 };
4478
4479
4480 Program::Program() { p = 0; }
4481
4482 Program::Program(const ProgramSource& src,
4483         const String& buildflags, String& errmsg)
4484 {
4485     p = 0;
4486     create(src, buildflags, errmsg);
4487 }
4488
4489 Program::Program(const Program& prog)
4490 {
4491     p = prog.p;
4492     if(p)
4493         p->addref();
4494 }
4495
4496 Program& Program::operator = (const Program& prog)
4497 {
4498     Impl* newp = (Impl*)prog.p;
4499     if(newp)
4500         newp->addref();
4501     if(p)
4502         p->release();
4503     p = newp;
4504     return *this;
4505 }
4506
4507 Program::~Program()
4508 {
4509     if(p)
4510         p->release();
4511 }
4512
4513 bool Program::create(const ProgramSource& src,
4514             const String& buildflags, String& errmsg)
4515 {
4516     if(p)
4517     {
4518         p->release();
4519         p = NULL;
4520     }
4521     p = new Impl(src, buildflags, errmsg);
4522     if(!p->handle)
4523     {
4524         p->release();
4525         p = 0;
4526     }
4527     return p != 0;
4528 }
4529
4530 void* Program::ptr() const
4531 {
4532     return p ? p->handle : 0;
4533 }
4534
4535 #ifndef OPENCV_REMOVE_DEPRECATED_API
4536 const ProgramSource& Program::source() const
4537 {
4538     CV_Error(Error::StsNotImplemented, "Removed API");
4539 }
4540
4541 bool Program::read(const String& bin, const String& buildflags)
4542 {
4543     CV_UNUSED(bin); CV_UNUSED(buildflags);
4544     CV_Error(Error::StsNotImplemented, "Removed API");
4545 }
4546
4547 bool Program::write(String& bin) const
4548 {
4549     CV_UNUSED(bin);
4550     CV_Error(Error::StsNotImplemented, "Removed API");
4551 }
4552
4553 String Program::getPrefix() const
4554 {
4555     if(!p)
4556         return String();
4557     Context::Impl* ctx_ = Context::getDefault().getImpl();
4558     CV_Assert(ctx_);
4559     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4560 }
4561
4562 String Program::getPrefix(const String& buildflags)
4563 {
4564         Context::Impl* ctx_ = Context::getDefault().getImpl();
4565         CV_Assert(ctx_);
4566         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4567 }
4568 #endif // OPENCV_REMOVE_DEPRECATED_API
4569
4570 void Program::getBinary(std::vector<char>& binary) const
4571 {
4572     CV_Assert(p && "Empty program");
4573     p->getProgramBinary(binary);
4574 }
4575
4576 Program Context::Impl::getProg(const ProgramSource& src,
4577                                const String& buildflags, String& errmsg)
4578 {
4579     size_t limit = getProgramCountLimit();
4580     const ProgramSource::Impl* src_ = src.getImpl();
4581     CV_Assert(src_);
4582     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4583             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4584             getPrefixString().c_str(),
4585             buildflags.c_str());
4586     {
4587         cv::AutoLock lock(program_cache_mutex);
4588         phash_t::iterator it = phash.find(key);
4589         if (it != phash.end())
4590         {
4591             // TODO LRU cache
4592             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4593             if (i != cacheList.end() && i != cacheList.begin())
4594             {
4595                 cacheList.erase(i);
4596                 cacheList.push_front(key);
4597             }
4598             return it->second;
4599         }
4600         { // cleanup program cache
4601             size_t sz = phash.size();
4602             if (limit > 0 && sz >= limit)
4603             {
4604                 static bool warningFlag = false;
4605                 if (!warningFlag)
4606                 {
4607                     printf("\nWARNING: OpenCV-OpenCL:\n"
4608                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4609                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4610                     warningFlag = true;
4611                 }
4612                 while (!cacheList.empty())
4613                 {
4614                     size_t c = phash.erase(cacheList.back());
4615                     cacheList.pop_back();
4616                     if (c != 0)
4617                         break;
4618                 }
4619             }
4620         }
4621     }
4622     Program prog(src, buildflags, errmsg);
4623     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4624     {
4625         cv::AutoLock lock(program_cache_mutex);
4626         phash.insert(std::pair<std::string, Program>(key, prog));
4627         cacheList.push_front(key);
4628     }
4629     return prog;
4630 }
4631
4632
4633 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4634
4635 template<typename T>
4636 class OpenCLBufferPool
4637 {
4638 protected:
4639     ~OpenCLBufferPool() { }
4640 public:
4641     virtual T allocate(size_t size) = 0;
4642     virtual void release(T buffer) = 0;
4643 };
4644
4645 template <typename Derived, typename BufferEntry, typename T>
4646 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4647 {
4648 private:
4649     inline Derived& derived() { return *static_cast<Derived*>(this); }
4650 protected:
4651     Mutex mutex_;
4652
4653     size_t currentReservedSize;
4654     size_t maxReservedSize;
4655
4656     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4657     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4658
4659     // synchronized
4660     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4661     {
4662         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4663         for (; i != allocatedEntries_.end(); ++i)
4664         {
4665             BufferEntry& e = *i;
4666             if (e.clBuffer_ == buffer)
4667             {
4668                 entry = e;
4669                 allocatedEntries_.erase(i);
4670                 return true;
4671             }
4672         }
4673         return false;
4674     }
4675
4676     // synchronized
4677     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4678     {
4679         if (reservedEntries_.empty())
4680             return false;
4681         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4682         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4683         BufferEntry result;
4684         size_t minDiff = (size_t)(-1);
4685         for (; i != reservedEntries_.end(); ++i)
4686         {
4687             BufferEntry& e = *i;
4688             if (e.capacity_ >= size)
4689             {
4690                 size_t diff = e.capacity_ - size;
4691                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4692                 {
4693                     minDiff = diff;
4694                     result_pos = i;
4695                     result = e;
4696                     if (diff == 0)
4697                         break;
4698                 }
4699             }
4700         }
4701         if (result_pos != reservedEntries_.end())
4702         {
4703             //CV_DbgAssert(result == *result_pos);
4704             reservedEntries_.erase(result_pos);
4705             entry = result;
4706             currentReservedSize -= entry.capacity_;
4707             allocatedEntries_.push_back(entry);
4708             return true;
4709         }
4710         return false;
4711     }
4712
4713     // synchronized
4714     void _checkSizeOfReservedEntries()
4715     {
4716         while (currentReservedSize > maxReservedSize)
4717         {
4718             CV_DbgAssert(!reservedEntries_.empty());
4719             const BufferEntry& entry = reservedEntries_.back();
4720             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4721             currentReservedSize -= entry.capacity_;
4722             derived()._releaseBufferEntry(entry);
4723             reservedEntries_.pop_back();
4724         }
4725     }
4726
4727     inline size_t _allocationGranularity(size_t size)
4728     {
4729         // heuristic values
4730         if (size < 1024*1024)
4731             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4732         else if (size < 16*1024*1024)
4733             return 64*1024;
4734         else
4735             return 1024*1024;
4736     }
4737
4738 public:
4739     OpenCLBufferPoolBaseImpl()
4740         : currentReservedSize(0),
4741           maxReservedSize(0)
4742     {
4743         // nothing
4744     }
4745     virtual ~OpenCLBufferPoolBaseImpl()
4746     {
4747         freeAllReservedBuffers();
4748         CV_Assert(reservedEntries_.empty());
4749     }
4750 public:
4751     virtual T allocate(size_t size) CV_OVERRIDE
4752     {
4753         AutoLock locker(mutex_);
4754         BufferEntry entry;
4755         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4756         {
4757             CV_DbgAssert(size <= entry.capacity_);
4758             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4759         }
4760         else
4761         {
4762             derived()._allocateBufferEntry(entry, size);
4763         }
4764         return entry.clBuffer_;
4765     }
4766     virtual void release(T buffer) CV_OVERRIDE
4767     {
4768         AutoLock locker(mutex_);
4769         BufferEntry entry;
4770         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4771         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4772         {
4773             derived()._releaseBufferEntry(entry);
4774         }
4775         else
4776         {
4777             reservedEntries_.push_front(entry);
4778             currentReservedSize += entry.capacity_;
4779             _checkSizeOfReservedEntries();
4780         }
4781     }
4782
4783     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4784     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4785     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4786     {
4787         AutoLock locker(mutex_);
4788         size_t oldMaxReservedSize = maxReservedSize;
4789         maxReservedSize = size;
4790         if (maxReservedSize < oldMaxReservedSize)
4791         {
4792             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4793             for (; i != reservedEntries_.end();)
4794             {
4795                 const BufferEntry& entry = *i;
4796                 if (entry.capacity_ > maxReservedSize / 8)
4797                 {
4798                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4799                     currentReservedSize -= entry.capacity_;
4800                     derived()._releaseBufferEntry(entry);
4801                     i = reservedEntries_.erase(i);
4802                     continue;
4803                 }
4804                 ++i;
4805             }
4806             _checkSizeOfReservedEntries();
4807         }
4808     }
4809     virtual void freeAllReservedBuffers() CV_OVERRIDE
4810     {
4811         AutoLock locker(mutex_);
4812         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4813         for (; i != reservedEntries_.end(); ++i)
4814         {
4815             const BufferEntry& entry = *i;
4816             derived()._releaseBufferEntry(entry);
4817         }
4818         reservedEntries_.clear();
4819         currentReservedSize = 0;
4820     }
4821 };
4822
4823 struct CLBufferEntry
4824 {
4825     cl_mem clBuffer_;
4826     size_t capacity_;
4827     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4828 };
4829
4830 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4831 {
4832 public:
4833     typedef struct CLBufferEntry BufferEntry;
4834 protected:
4835     int createFlags_;
4836 public:
4837     OpenCLBufferPoolImpl(int createFlags = 0)
4838         : createFlags_(createFlags)
4839     {
4840     }
4841
4842     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4843     {
4844         CV_DbgAssert(entry.clBuffer_ == NULL);
4845         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4846         Context& ctx = Context::getDefault();
4847         cl_int retval = CL_SUCCESS;
4848         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4849         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4850         CV_Assert(entry.clBuffer_ != NULL);
4851         if(retval == CL_SUCCESS)
4852         {
4853             CV_IMPL_ADD(CV_IMPL_OCL);
4854         }
4855         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4856                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4857         allocatedEntries_.push_back(entry);
4858     }
4859
4860     void _releaseBufferEntry(const BufferEntry& entry)
4861     {
4862         CV_Assert(entry.capacity_ != 0);
4863         CV_Assert(entry.clBuffer_ != NULL);
4864         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4865                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4866         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4867     }
4868 };
4869
4870 #ifdef HAVE_OPENCL_SVM
4871 struct CLSVMBufferEntry
4872 {
4873     void* clBuffer_;
4874     size_t capacity_;
4875     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4876 };
4877 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4878 {
4879 public:
4880     typedef struct CLSVMBufferEntry BufferEntry;
4881 public:
4882     OpenCLSVMBufferPoolImpl()
4883     {
4884     }
4885
4886     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4887     {
4888         CV_DbgAssert(entry.clBuffer_ == NULL);
4889         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4890
4891         Context& ctx = Context::getDefault();
4892         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4893         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4894         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4895                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4896
4897         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4898         CV_DbgAssert(svmFns->isValid());
4899
4900         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4901         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4902         CV_Assert(buf);
4903
4904         entry.clBuffer_ = buf;
4905         {
4906             CV_IMPL_ADD(CV_IMPL_OCL);
4907         }
4908         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4909                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4910         allocatedEntries_.push_back(entry);
4911     }
4912
4913     void _releaseBufferEntry(const BufferEntry& entry)
4914     {
4915         CV_Assert(entry.capacity_ != 0);
4916         CV_Assert(entry.clBuffer_ != NULL);
4917         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4918                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4919         Context& ctx = Context::getDefault();
4920         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4921         CV_DbgAssert(svmFns->isValid());
4922         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4923         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4924     }
4925 };
4926 #endif
4927
4928
4929
4930 template <bool readAccess, bool writeAccess>
4931 class AlignedDataPtr
4932 {
4933 protected:
4934     const size_t size_;
4935     uchar* const originPtr_;
4936     const size_t alignment_;
4937     uchar* ptr_;
4938     uchar* allocatedPtr_;
4939
4940 public:
4941     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4942         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4943     {
4944         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4945         CV_DbgAssert(!readAccess || ptr);
4946         if (((size_t)ptr_ & (alignment - 1)) != 0)
4947         {
4948             allocatedPtr_ = new uchar[size_ + alignment - 1];
4949             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4950             if (readAccess)
4951             {
4952                 memcpy(ptr_, originPtr_, size_);
4953             }
4954         }
4955     }
4956
4957     uchar* getAlignedPtr() const
4958     {
4959         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4960         return ptr_;
4961     }
4962
4963     ~AlignedDataPtr()
4964     {
4965         if (allocatedPtr_)
4966         {
4967             if (writeAccess)
4968             {
4969                 memcpy(originPtr_, ptr_, size_);
4970             }
4971             delete[] allocatedPtr_;
4972             allocatedPtr_ = NULL;
4973         }
4974         ptr_ = NULL;
4975     }
4976 private:
4977     AlignedDataPtr(const AlignedDataPtr&); // disabled
4978     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4979 };
4980
4981 template <bool readAccess, bool writeAccess>
4982 class AlignedDataPtr2D
4983 {
4984 protected:
4985     const size_t size_;
4986     uchar* const originPtr_;
4987     const size_t alignment_;
4988     uchar* ptr_;
4989     uchar* allocatedPtr_;
4990     size_t rows_;
4991     size_t cols_;
4992     size_t step_;
4993
4994 public:
4995     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4996         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4997     {
4998         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4999         CV_DbgAssert(!readAccess || ptr != NULL);
5000         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5001         {
5002             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5003             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5004             if (readAccess)
5005             {
5006                 for (size_t i = 0; i < rows_; i++)
5007                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5008             }
5009         }
5010     }
5011
5012     uchar* getAlignedPtr() const
5013     {
5014         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5015         return ptr_;
5016     }
5017
5018     ~AlignedDataPtr2D()
5019     {
5020         if (allocatedPtr_)
5021         {
5022             if (writeAccess)
5023             {
5024                 for (size_t i = 0; i < rows_; i++)
5025                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5026             }
5027             delete[] allocatedPtr_;
5028             allocatedPtr_ = NULL;
5029         }
5030         ptr_ = NULL;
5031     }
5032 private:
5033     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5034     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5035 };
5036
5037 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5038 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5039 #endif
5040
5041
5042 void Context::Impl::__init_buffer_pools()
5043 {
5044     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5045     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5046     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5047     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5048
5049     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5050     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5051     bufferPool.setMaxReservedSize(poolSize);
5052     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5053     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5054
5055 #ifdef HAVE_OPENCL_SVM
5056     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5057     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5058     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5059     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5060 #endif
5061
5062     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5063 }
5064
5065 class OpenCLAllocator CV_FINAL : public MatAllocator
5066 {
5067 public:
5068     enum AllocatorFlags
5069     {
5070         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5071         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5072 #ifdef HAVE_OPENCL_SVM
5073         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5074 #endif
5075         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5076     };
5077
5078     OpenCLAllocator()
5079     {
5080         matStdAllocator = Mat::getDefaultAllocator();
5081     }
5082     ~OpenCLAllocator()
5083     {
5084         flushCleanupQueue();
5085     }
5086
5087     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5088             AccessFlag flags, UMatUsageFlags usageFlags) const
5089     {
5090         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5091         return u;
5092     }
5093
5094     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5095     {
5096         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5097         return value;
5098     }
5099     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5100     {
5101         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5102         return value;
5103     }
5104
5105     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5106     {
5107         const Device& dev = ctx.device(0);
5108         createFlags = 0;
5109         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5110             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5111
5112         if (!isOpenCLCopyingForced() &&
5113             (isOpenCLMapForced() ||
5114                 (dev.hostUnifiedMemory()
5115 #ifndef __APPLE__
5116                 || dev.isIntel()
5117 #endif
5118                 )
5119             )
5120         )
5121             flags0 = static_cast<UMatData::MemoryFlag>(0);
5122         else
5123             flags0 = UMatData::COPY_ON_MAP;
5124     }
5125
5126     UMatData* allocate(int dims, const int* sizes, int type,
5127                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5128     {
5129         if(!useOpenCL())
5130             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5131
5132         flushCleanupQueue();
5133
5134         CV_Assert(data == 0);
5135         size_t total = CV_ELEM_SIZE(type);
5136         for( int i = dims-1; i >= 0; i-- )
5137         {
5138             if( step )
5139                 step[i] = total;
5140             total *= sizes[i];
5141         }
5142
5143         Context& ctx = Context::getDefault();
5144         if (!ctx.getImpl())
5145             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5146         Context::Impl& ctxImpl = *ctx.getImpl();
5147
5148         int createFlags = 0;
5149         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5150         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5151
5152         void* handle = NULL;
5153         int allocatorFlags = 0;
5154
5155 #ifdef HAVE_OPENCL_SVM
5156         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5157         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5158         {
5159             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5160             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5161
5162             // this property is constant, so single buffer pool can be used here
5163             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5164             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5165         }
5166         else
5167 #endif
5168         if (createFlags == 0)
5169         {
5170             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5171             handle = ctxImpl.getBufferPool().allocate(total);
5172         }
5173         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5174         {
5175             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5176             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5177         }
5178         else
5179         {
5180             CV_Assert(handle != NULL); // Unsupported, throw
5181         }
5182
5183         if (!handle)
5184             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5185
5186         UMatData* u = new UMatData(this);
5187         u->data = 0;
5188         u->size = total;
5189         u->handle = handle;
5190         u->flags = flags0;
5191         u->allocatorFlags_ = allocatorFlags;
5192         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5193         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5194         u->markHostCopyObsolete(true);
5195         opencl_allocator_stats.onAllocate(u->size);
5196         return u;
5197     }
5198
5199     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5200     {
5201         if(!u)
5202             return false;
5203
5204         flushCleanupQueue();
5205
5206         UMatDataAutoLock lock(u);
5207
5208         if(u->handle == 0)
5209         {
5210             CV_Assert(u->origdata != 0);
5211             Context& ctx = Context::getDefault();
5212             int createFlags = 0;
5213             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5214             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5215
5216             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5217
5218             cl_context ctx_handle = (cl_context)ctx.ptr();
5219             int allocatorFlags = 0;
5220             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5221             void* handle = NULL;
5222             cl_int retval = CL_SUCCESS;
5223
5224 #ifdef HAVE_OPENCL_SVM
5225             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5226             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5227             if (useSVM && svmCaps.isSupportFineGrainSystem())
5228             {
5229                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5230                 tempUMatFlags = UMatData::TEMP_UMAT;
5231                 handle = u->origdata;
5232                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5233             }
5234             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5235             {
5236                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5237                 {
5238                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5239
5240                     cl_svm_mem_flags memFlags = createFlags |
5241                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5242
5243                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5244                     CV_DbgAssert(svmFns->isValid());
5245
5246                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5247                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5248                     CV_Assert(handle);
5249
5250                     cl_command_queue q = NULL;
5251                     if (!isFineGrainBuffer)
5252                     {
5253                         q = (cl_command_queue)Queue::getDefault().ptr();
5254                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5255                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5256                                 handle, u->size,
5257                                 0, NULL, NULL);
5258                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5259
5260                     }
5261                     memcpy(handle, u->origdata, u->size);
5262                     if (!isFineGrainBuffer)
5263                     {
5264                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5265                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5266                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5267                     }
5268
5269                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5270                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5271                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5272                 }
5273             }
5274             else
5275 #endif
5276             {
5277                 if( copyOnMap )
5278                     accessFlags &= ~ACCESS_FAST;
5279
5280                 tempUMatFlags = UMatData::TEMP_UMAT;
5281                 if (
5282                 #ifdef __APPLE__
5283                     !copyOnMap &&
5284                 #endif
5285                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5286                     // There are OpenCL runtime issues for less aligned data
5287                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5288                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5289                     // Avoid sharing of host memory between OpenCL buffers
5290                     && !(u->originalUMatData && u->originalUMatData->handle)
5291                 )
5292                 {
5293                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5294                                             u->size, u->origdata, &retval);
5295                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5296                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5297                 }
5298                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5299                 {
5300                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5301                                                u->size, u->origdata, &retval);
5302                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5303                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5304                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5305                 }
5306             }
5307             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5308             if(!handle || retval != CL_SUCCESS)
5309                 return false;
5310             u->handle = handle;
5311             u->prevAllocator = u->currAllocator;
5312             u->currAllocator = this;
5313             u->flags |= tempUMatFlags | flags0;
5314             u->allocatorFlags_ = allocatorFlags;
5315         }
5316         if (!!(accessFlags & ACCESS_WRITE))
5317             u->markHostCopyObsolete(true);
5318         opencl_allocator_stats.onAllocate(u->size);
5319         return true;
5320     }
5321
5322     /*void sync(UMatData* u) const
5323     {
5324         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5325         UMatDataAutoLock lock(u);
5326
5327         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5328         {
5329             if( u->tempCopiedUMat() )
5330             {
5331                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5332                                     u->size, u->origdata, 0, 0, 0);
5333             }
5334             else
5335             {
5336                 cl_int retval = 0;
5337                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5338                                                 (CL_MAP_READ | CL_MAP_WRITE),
5339                                                 0, u->size, 0, 0, 0, &retval);
5340                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5341                 clFinish(q);
5342             }
5343             u->markHostCopyObsolete(false);
5344         }
5345         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5346         {
5347             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5348                                  u->size, u->data, 0, 0, 0);
5349         }
5350     }*/
5351
5352     void deallocate(UMatData* u) const CV_OVERRIDE
5353     {
5354         if(!u)
5355             return;
5356
5357         CV_Assert(u->urefcount == 0);
5358         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5359
5360         CV_Assert(u->handle != 0);
5361         CV_Assert(u->mapcount == 0);
5362
5363         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5364             addToCleanupQueue(u);
5365         else
5366             deallocate_(u);
5367     }
5368
5369     void deallocate_(UMatData* u) const
5370     {
5371         CV_Assert(u);
5372         CV_Assert(u->handle);
5373         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5374         {
5375             opencl_allocator_stats.onFree(u->size);
5376         }
5377
5378 #ifdef _WIN32
5379         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5380             return;             // avoid any OpenCL calls
5381 #endif
5382         if(u->tempUMat())
5383         {
5384             CV_Assert(u->origdata);
5385 //            UMatDataAutoLock lock(u);
5386
5387             if (u->hostCopyObsolete())
5388             {
5389 #ifdef HAVE_OPENCL_SVM
5390                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5391                 {
5392                     Context& ctx = Context::getDefault();
5393                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5394                     CV_DbgAssert(svmFns->isValid());
5395
5396                     if( u->tempCopiedUMat() )
5397                     {
5398                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5399                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5400                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5401                         cl_command_queue q = NULL;
5402                         if (!isFineGrainBuffer)
5403                         {
5404                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5405                             q = (cl_command_queue)Queue::getDefault().ptr();
5406                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5407                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5408                                     u->handle, u->size,
5409                                     0, NULL, NULL);
5410                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5411                         }
5412                         clFinish(q);
5413                         memcpy(u->origdata, u->handle, u->size);
5414                         if (!isFineGrainBuffer)
5415                         {
5416                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5417                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5418                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5419                         }
5420                     }
5421                     else
5422                     {
5423                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5424                         // nothing
5425                     }
5426                 }
5427                 else
5428 #endif
5429                 {
5430                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5431                     if( u->tempCopiedUMat() )
5432                     {
5433                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5434                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5435                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5436                     }
5437                     else
5438                     {
5439                         cl_int retval = 0;
5440                         if (u->tempUMat())
5441                         {
5442                             CV_Assert(u->mapcount == 0);
5443                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5444                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5445                                 (CL_MAP_READ | CL_MAP_WRITE),
5446                                 0, u->size, 0, 0, 0, &retval);
5447                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5448                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5449                             if (u->originalUMatData)
5450                             {
5451                                 CV_Assert(u->originalUMatData->data == data);
5452                             }
5453                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5454                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5455                             CV_OCL_DBG_CHECK(clFinish(q));
5456                         }
5457                     }
5458                 }
5459                 u->markHostCopyObsolete(false);
5460             }
5461             else
5462             {
5463                 // nothing
5464             }
5465 #ifdef HAVE_OPENCL_SVM
5466             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5467             {
5468                 if( u->tempCopiedUMat() )
5469                 {
5470                     Context& ctx = Context::getDefault();
5471                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5472                     CV_DbgAssert(svmFns->isValid());
5473
5474                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5475                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5476                 }
5477             }
5478             else
5479 #endif
5480             {
5481                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5482                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5483             }
5484             u->handle = 0;
5485             u->markDeviceCopyObsolete(true);
5486             u->currAllocator = u->prevAllocator;
5487             u->prevAllocator = NULL;
5488             if(u->data && u->copyOnMap() && u->data != u->origdata)
5489                 fastFree(u->data);
5490             u->data = u->origdata;
5491             u->currAllocator->deallocate(u);
5492             u = NULL;
5493         }
5494         else
5495         {
5496             CV_Assert(u->origdata == NULL);
5497             if(u->data && u->copyOnMap() && u->data != u->origdata)
5498             {
5499                 fastFree(u->data);
5500                 u->data = 0;
5501                 u->markHostCopyObsolete(true);
5502             }
5503             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5504             {
5505                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5506                 CV_Assert(pCtx);
5507                 ocl::Context& ctx = *pCtx.get();
5508                 CV_Assert(ctx.getImpl());
5509                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5510             }
5511             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5512             {
5513                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5514                 CV_Assert(pCtx);
5515                 ocl::Context& ctx = *pCtx.get();
5516                 CV_Assert(ctx.getImpl());
5517                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5518             }
5519 #ifdef HAVE_OPENCL_SVM
5520             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5521             {
5522                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5523                 CV_Assert(pCtx);
5524                 ocl::Context& ctx = *pCtx.get();
5525                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5526                 {
5527                     //nothing
5528                 }
5529                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5530                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5531                 {
5532                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5533                     CV_DbgAssert(svmFns->isValid());
5534                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5535
5536                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5537                     {
5538                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5539                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5540                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5541                     }
5542                 }
5543                 CV_Assert(ctx.getImpl());
5544                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5545             }
5546 #endif
5547             else
5548             {
5549                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5550             }
5551             u->handle = 0;
5552             u->markDeviceCopyObsolete(true);
5553             delete u;
5554             u = NULL;
5555         }
5556         CV_Assert(u == NULL);
5557     }
5558
5559     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5560     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5561     {
5562         CV_Assert(u && u->handle);
5563
5564         if (!!(accessFlags & ACCESS_WRITE))
5565             u->markDeviceCopyObsolete(true);
5566
5567         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5568
5569         {
5570             if( !u->copyOnMap() )
5571             {
5572                 // TODO
5573                 // because there can be other map requests for the same UMat with different access flags,
5574                 // we use the universal (read-write) access mode.
5575 #ifdef HAVE_OPENCL_SVM
5576                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5577                 {
5578                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5579                     {
5580                         Context& ctx = Context::getDefault();
5581                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5582                         CV_DbgAssert(svmFns->isValid());
5583
5584                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5585                         {
5586                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5587                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5588                                     u->handle, u->size,
5589                                     0, NULL, NULL);
5590                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5591                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5592                         }
5593                     }
5594                     clFinish(q);
5595                     u->data = (uchar*)u->handle;
5596                     u->markHostCopyObsolete(false);
5597                     u->markDeviceMemMapped(true);
5598                     return;
5599                 }
5600 #endif
5601
5602                 cl_int retval = CL_SUCCESS;
5603                 if (!u->deviceMemMapped())
5604                 {
5605                     CV_Assert(u->refcount == 1);
5606                     CV_Assert(u->mapcount++ == 0);
5607                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5608                                                          (CL_MAP_READ | CL_MAP_WRITE),
5609                                                          0, u->size, 0, 0, 0, &retval);
5610                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5611                 }
5612                 if (u->data && retval == CL_SUCCESS)
5613                 {
5614                     u->markHostCopyObsolete(false);
5615                     u->markDeviceMemMapped(true);
5616                     return;
5617                 }
5618
5619                 // TODO Is it really a good idea and was it tested well?
5620                 // if map failed, switch to copy-on-map mode for the particular buffer
5621                 u->flags |= UMatData::COPY_ON_MAP;
5622             }
5623
5624             if(!u->data)
5625             {
5626                 u->data = (uchar*)fastMalloc(u->size);
5627                 u->markHostCopyObsolete(true);
5628             }
5629         }
5630
5631         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5632         {
5633             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5634 #ifdef HAVE_OPENCL_SVM
5635             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5636 #endif
5637             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5638                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5639             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5640                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5641             u->markHostCopyObsolete(false);
5642         }
5643     }
5644
5645     void unmap(UMatData* u) const CV_OVERRIDE
5646     {
5647         if(!u)
5648             return;
5649
5650
5651         CV_Assert(u->handle != 0);
5652
5653         UMatDataAutoLock autolock(u);
5654
5655         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5656         cl_int retval = 0;
5657         if( !u->copyOnMap() && u->deviceMemMapped() )
5658         {
5659             CV_Assert(u->data != NULL);
5660 #ifdef HAVE_OPENCL_SVM
5661             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5662             {
5663                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5664                 {
5665                     Context& ctx = Context::getDefault();
5666                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5667                     CV_DbgAssert(svmFns->isValid());
5668
5669                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5670                     {
5671                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5672                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5673                                 0, NULL, NULL);
5674                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5675                         clFinish(q);
5676                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5677                     }
5678                 }
5679                 if (u->refcount == 0)
5680                     u->data = 0;
5681                 u->markDeviceCopyObsolete(false);
5682                 u->markHostCopyObsolete(true);
5683                 return;
5684             }
5685 #endif
5686             if (u->refcount == 0)
5687             {
5688                 CV_Assert(u->mapcount-- == 1);
5689                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5690                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5691                 if (Device::getDefault().isAMD())
5692                 {
5693                     // required for multithreaded applications (see stitching test)
5694                     CV_OCL_DBG_CHECK(clFinish(q));
5695                 }
5696                 u->markDeviceMemMapped(false);
5697                 u->data = 0;
5698                 u->markDeviceCopyObsolete(false);
5699                 u->markHostCopyObsolete(true);
5700             }
5701         }
5702         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5703         {
5704             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5705 #ifdef HAVE_OPENCL_SVM
5706             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5707 #endif
5708             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5709                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5710             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5711                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5712             u->markDeviceCopyObsolete(false);
5713             u->markHostCopyObsolete(true);
5714         }
5715     }
5716
5717     bool checkContinuous(int dims, const size_t sz[],
5718                          const size_t srcofs[], const size_t srcstep[],
5719                          const size_t dstofs[], const size_t dststep[],
5720                          size_t& total, size_t new_sz[],
5721                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5722                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5723     {
5724         bool iscontinuous = true;
5725         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5726         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5727         total = sz[dims-1];
5728         for( int i = dims-2; i >= 0; i-- )
5729         {
5730             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5731                 iscontinuous = false;
5732             total *= sz[i];
5733             if( srcofs )
5734                 srcrawofs += srcofs[i]*srcstep[i];
5735             if( dstofs )
5736                 dstrawofs += dstofs[i]*dststep[i];
5737         }
5738
5739         if( !iscontinuous )
5740         {
5741             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5742             if( dims == 2 )
5743             {
5744                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5745                 // we assume that new_... arrays are initialized by caller
5746                 // with 0's, so there is no else branch
5747                 if( srcofs )
5748                 {
5749                     new_srcofs[0] = srcofs[1];
5750                     new_srcofs[1] = srcofs[0];
5751                     new_srcofs[2] = 0;
5752                 }
5753
5754                 if( dstofs )
5755                 {
5756                     new_dstofs[0] = dstofs[1];
5757                     new_dstofs[1] = dstofs[0];
5758                     new_dstofs[2] = 0;
5759                 }
5760
5761                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5762                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5763             }
5764             else
5765             {
5766                 // we could check for dims == 3 here,
5767                 // but from user perspective this one is more informative
5768                 CV_Assert(dims <= 3);
5769                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5770                 if( srcofs )
5771                 {
5772                     new_srcofs[0] = srcofs[2];
5773                     new_srcofs[1] = srcofs[1];
5774                     new_srcofs[2] = srcofs[0];
5775                 }
5776
5777                 if( dstofs )
5778                 {
5779                     new_dstofs[0] = dstofs[2];
5780                     new_dstofs[1] = dstofs[1];
5781                     new_dstofs[2] = dstofs[0];
5782                 }
5783
5784                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5785                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5786             }
5787         }
5788         return iscontinuous;
5789     }
5790
5791     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5792                   const size_t srcofs[], const size_t srcstep[],
5793                   const size_t dststep[]) const CV_OVERRIDE
5794     {
5795         if(!u)
5796             return;
5797         UMatDataAutoLock autolock(u);
5798
5799         if( u->data && !u->hostCopyObsolete() )
5800         {
5801             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5802             return;
5803         }
5804         CV_Assert( u->handle != 0 );
5805
5806         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5807
5808         size_t total = 0, new_sz[] = {0, 0, 0};
5809         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5810         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5811
5812         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5813                                             total, new_sz,
5814                                             srcrawofs, new_srcofs, new_srcstep,
5815                                             dstrawofs, new_dstofs, new_dststep);
5816
5817 #ifdef HAVE_OPENCL_SVM
5818         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5819         {
5820             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5821             Context& ctx = Context::getDefault();
5822             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5823             CV_DbgAssert(svmFns->isValid());
5824
5825             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5826             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5827             {
5828                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5829                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5830                         u->handle, u->size,
5831                         0, NULL, NULL);
5832                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5833             }
5834             clFinish(q);
5835             if( iscontinuous )
5836             {
5837                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5838             }
5839             else
5840             {
5841                 // This code is from MatAllocator::download()
5842                 int isz[CV_MAX_DIM];
5843                 uchar* srcptr = (uchar*)u->handle;
5844                 for( int i = 0; i < dims; i++ )
5845                 {
5846                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5847                     if( sz[i] == 0 )
5848                     return;
5849                     if( srcofs )
5850                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5851                     isz[i] = (int)sz[i];
5852                 }
5853
5854                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5855                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5856
5857                 const Mat* arrays[] = { &src, &dst };
5858                 uchar* ptrs[2];
5859                 NAryMatIterator it(arrays, ptrs, 2);
5860                 size_t j, planesz = it.size;
5861
5862                 for( j = 0; j < it.nplanes; j++, ++it )
5863                     memcpy(ptrs[1], ptrs[0], planesz);
5864             }
5865             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5866             {
5867                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5868                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5869                         0, NULL, NULL);
5870                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5871                 clFinish(q);
5872             }
5873         }
5874         else
5875 #endif
5876         {
5877             if( iscontinuous )
5878             {
5879                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5880                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5881                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5882             }
5883             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5884             {
5885                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5886                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5887                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5888                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5889                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5890                 uchar* ptr = alignedPtr.getAlignedPtr();
5891
5892                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5893                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5894                 total = std::min(total, u->size - new_srcrawofs);
5895                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5896                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5897                 for( size_t i = 0; i < new_sz[1]; i++ )
5898                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5899             }
5900             else
5901             {
5902                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5903                 uchar* ptr = alignedPtr.getAlignedPtr();
5904
5905                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5906                     new_srcofs, new_dstofs, new_sz,
5907                     new_srcstep[0], 0,
5908                     new_dststep[0], 0,
5909                     ptr, 0, 0, 0));
5910             }
5911         }
5912     }
5913
5914     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5915                 const size_t dstofs[], const size_t dststep[],
5916                 const size_t srcstep[]) const CV_OVERRIDE
5917     {
5918         if(!u)
5919             return;
5920
5921         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5922         CV_Assert(u->refcount == 0 || u->tempUMat());
5923
5924         size_t total = 0, new_sz[] = {0, 0, 0};
5925         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5926         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5927
5928         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5929                                             total, new_sz,
5930                                             srcrawofs, new_srcofs, new_srcstep,
5931                                             dstrawofs, new_dstofs, new_dststep);
5932
5933         UMatDataAutoLock autolock(u);
5934
5935         // if there is cached CPU copy of the GPU matrix,
5936         // we could use it as a destination.
5937         // we can do it in 2 cases:
5938         //    1. we overwrite the whole content
5939         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5940         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5941         {
5942             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5943             u->markHostCopyObsolete(false);
5944             u->markDeviceCopyObsolete(true);
5945             return;
5946         }
5947
5948         CV_Assert( u->handle != 0 );
5949         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5950
5951 #ifdef HAVE_OPENCL_SVM
5952         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5953         {
5954             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5955             Context& ctx = Context::getDefault();
5956             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5957             CV_DbgAssert(svmFns->isValid());
5958
5959             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5960             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5961             {
5962                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5963                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5964                         u->handle, u->size,
5965                         0, NULL, NULL);
5966                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5967             }
5968             clFinish(q);
5969             if( iscontinuous )
5970             {
5971                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5972             }
5973             else
5974             {
5975                 // This code is from MatAllocator::upload()
5976                 int isz[CV_MAX_DIM];
5977                 uchar* dstptr = (uchar*)u->handle;
5978                 for( int i = 0; i < dims; i++ )
5979                 {
5980                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5981                     if( sz[i] == 0 )
5982                     return;
5983                     if( dstofs )
5984                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5985                     isz[i] = (int)sz[i];
5986                 }
5987
5988                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5989                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5990
5991                 const Mat* arrays[] = { &src, &dst };
5992                 uchar* ptrs[2];
5993                 NAryMatIterator it(arrays, ptrs, 2);
5994                 size_t j, planesz = it.size;
5995
5996                 for( j = 0; j < it.nplanes; j++, ++it )
5997                     memcpy(ptrs[1], ptrs[0], planesz);
5998             }
5999             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6000             {
6001                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6002                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6003                         0, NULL, NULL);
6004                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6005                 clFinish(q);
6006             }
6007         }
6008         else
6009 #endif
6010         {
6011             if( iscontinuous )
6012             {
6013                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6014                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6015                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6016                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6017                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6018             }
6019             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6020             {
6021                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6022                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6023                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6024                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6025                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6026                 uchar* ptr = alignedPtr.getAlignedPtr();
6027
6028                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6029                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6030                 total = std::min(total, u->size - new_dstrawofs);
6031                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6032                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6033                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6034                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6035                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6036                 for( size_t i = 0; i < new_sz[1]; i++ )
6037                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6038                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6039                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6040             }
6041             else
6042             {
6043                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6044                 uchar* ptr = alignedPtr.getAlignedPtr();
6045
6046                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6047                     new_dstofs, new_srcofs, new_sz,
6048                     new_dststep[0], 0,
6049                     new_srcstep[0], 0,
6050                     ptr, 0, 0, 0));
6051             }
6052         }
6053         u->markHostCopyObsolete(true);
6054 #ifdef HAVE_OPENCL_SVM
6055         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6056                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6057         {
6058             // nothing
6059         }
6060         else
6061 #endif
6062         {
6063             u->markHostCopyObsolete(true);
6064         }
6065         u->markDeviceCopyObsolete(false);
6066     }
6067
6068     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6069               const size_t srcofs[], const size_t srcstep[],
6070               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6071     {
6072         if(!src || !dst)
6073             return;
6074
6075         size_t total = 0, new_sz[] = {0, 0, 0};
6076         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6077         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6078
6079         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6080                                             total, new_sz,
6081                                             srcrawofs, new_srcofs, new_srcstep,
6082                                             dstrawofs, new_dstofs, new_dststep);
6083
6084         UMatDataAutoLock src_autolock(src, dst);
6085
6086         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6087         {
6088             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6089             return;
6090         }
6091         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6092         {
6093             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6094             dst->markHostCopyObsolete(false);
6095 #ifdef HAVE_OPENCL_SVM
6096             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6097                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6098             {
6099                 // nothing
6100             }
6101             else
6102 #endif
6103             {
6104                 dst->markDeviceCopyObsolete(true);
6105             }
6106             return;
6107         }
6108
6109         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6110         CV_Assert(dst->refcount == 0);
6111         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6112
6113         cl_int retval = CL_SUCCESS;
6114 #ifdef HAVE_OPENCL_SVM
6115         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6116                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6117         {
6118             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6119                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6120             {
6121                 Context& ctx = Context::getDefault();
6122                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6123                 CV_DbgAssert(svmFns->isValid());
6124
6125                 if( iscontinuous )
6126                 {
6127                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6128                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6129                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6130                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6131                             total, 0, NULL, NULL);
6132                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6133                 }
6134                 else
6135                 {
6136                     clFinish(q);
6137                     // This code is from MatAllocator::download()/upload()
6138                     int isz[CV_MAX_DIM];
6139                     uchar* srcptr = (uchar*)src->handle;
6140                     for( int i = 0; i < dims; i++ )
6141                     {
6142                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6143                         if( sz[i] == 0 )
6144                         return;
6145                         if( srcofs )
6146                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6147                         isz[i] = (int)sz[i];
6148                     }
6149                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6150
6151                     uchar* dstptr = (uchar*)dst->handle;
6152                     for( int i = 0; i < dims; i++ )
6153                     {
6154                         if( dstofs )
6155                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6156                     }
6157                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6158
6159                     const Mat* arrays[] = { &m_src, &m_dst };
6160                     uchar* ptrs[2];
6161                     NAryMatIterator it(arrays, ptrs, 2);
6162                     size_t j, planesz = it.size;
6163
6164                     for( j = 0; j < it.nplanes; j++, ++it )
6165                         memcpy(ptrs[1], ptrs[0], planesz);
6166                 }
6167             }
6168             else
6169             {
6170                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6171                 {
6172                     map(src, ACCESS_READ);
6173                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6174                     unmap(src);
6175                 }
6176                 else
6177                 {
6178                     map(dst, ACCESS_WRITE);
6179                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6180                     unmap(dst);
6181                 }
6182             }
6183         }
6184         else
6185 #endif
6186         {
6187             if( iscontinuous )
6188             {
6189                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6190                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6191                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6192                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6193             }
6194             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6195             {
6196                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6197                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6198                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6199                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6200                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6201
6202                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6203                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6204                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6205                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6206                 uchar* srcptr = srcBuf.getAlignedPtr();
6207                 uchar* dstptr = dstBuf.getAlignedPtr();
6208
6209                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6210
6211                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6212                 src_total = std::min(src_total, src->size - new_srcrawofs);
6213                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6214                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6215
6216                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6217                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6218                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6219                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6220
6221                 for( size_t i = 0; i < new_sz[1]; i++ )
6222                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6223                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6224                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6225                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6226             }
6227             else
6228             {
6229                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6230                                                    new_srcofs, new_dstofs, new_sz,
6231                                                    new_srcstep[0], 0,
6232                                                    new_dststep[0], 0,
6233                                                    0, 0, 0));
6234             }
6235         }
6236         if (retval == CL_SUCCESS)
6237         {
6238             CV_IMPL_ADD(CV_IMPL_OCL)
6239         }
6240
6241 #ifdef HAVE_OPENCL_SVM
6242         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6243             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6244         {
6245             // nothing
6246         }
6247         else
6248 #endif
6249         {
6250             dst->markHostCopyObsolete(true);
6251         }
6252         dst->markDeviceCopyObsolete(false);
6253
6254         if( _sync )
6255         {
6256             CV_OCL_DBG_CHECK(clFinish(q));
6257         }
6258     }
6259
6260     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6261     {
6262         ocl::Context ctx = Context::getDefault();
6263         if (ctx.empty())
6264             return NULL;
6265 #ifdef HAVE_OPENCL_SVM
6266         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6267         {
6268             return &ctx.getImpl()->getBufferPoolSVM();
6269         }
6270 #endif
6271         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6272         {
6273             return &ctx.getImpl()->getBufferPoolHostPtr();
6274         }
6275         if (id != NULL && strcmp(id, "OCL") != 0)
6276         {
6277             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6278         }
6279         return &ctx.getImpl()->getBufferPool();
6280     }
6281
6282     MatAllocator* matStdAllocator;
6283
6284     mutable cv::Mutex cleanupQueueMutex;
6285     mutable std::deque<UMatData*> cleanupQueue;
6286
6287     void flushCleanupQueue() const
6288     {
6289         if (!cleanupQueue.empty())
6290         {
6291             std::deque<UMatData*> q;
6292             {
6293                 cv::AutoLock lock(cleanupQueueMutex);
6294                 q.swap(cleanupQueue);
6295             }
6296             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6297             {
6298                 deallocate_(*i);
6299             }
6300         }
6301     }
6302     void addToCleanupQueue(UMatData* u) const
6303     {
6304         //TODO: Validation check: CV_Assert(!u->tempUMat());
6305         {
6306             cv::AutoLock lock(cleanupQueueMutex);
6307             cleanupQueue.push_back(u);
6308         }
6309     }
6310 };
6311
6312 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6313 {
6314     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6315     g_isOpenCVActivated = true;
6316     return g_allocator;
6317 }
6318 MatAllocator* getOpenCLAllocator()
6319 {
6320     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6321 }
6322
6323 }} // namespace cv::ocl
6324
6325
6326 namespace cv {
6327
6328 // three funcs below are implemented in umatrix.cpp
6329 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6330               bool autoSteps = false );
6331 void finalizeHdr(UMat& m);
6332
6333 } // namespace cv
6334
6335
6336 namespace cv { namespace ocl {
6337
6338 /*
6339 // Convert OpenCL buffer memory to UMat
6340 */
6341 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6342 {
6343     int d = 2;
6344     int sizes[] = { rows, cols };
6345
6346     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6347
6348     dst.release();
6349
6350     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6351     dst.usageFlags = USAGE_DEFAULT;
6352
6353     setSize(dst, d, sizes, 0, true);
6354     dst.offset = 0;
6355
6356     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6357     cl_mem_object_type mem_type = 0;
6358
6359     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6360
6361     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6362
6363     size_t total = 0;
6364     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6365
6366     CV_OCL_CHECK(clRetainMemObject(memobj));
6367
6368     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6369     CV_Assert(total >= rows * step);
6370
6371     // attach clBuffer to UMatData
6372     dst.u = new UMatData(getOpenCLAllocator());
6373     dst.u->data            = 0;
6374     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6375     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6376     dst.u->handle          = cl_mem_buffer;
6377     dst.u->origdata        = 0;
6378     dst.u->prevAllocator   = 0;
6379     dst.u->size            = total;
6380
6381     finalizeHdr(dst);
6382     dst.addref();
6383
6384     return;
6385 } // convertFromBuffer()
6386
6387
6388 /*
6389 // Convert OpenCL image2d_t memory to UMat
6390 */
6391 void convertFromImage(void* cl_mem_image, UMat& dst)
6392 {
6393     cl_mem             clImage = (cl_mem)cl_mem_image;
6394     cl_mem_object_type mem_type = 0;
6395
6396     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6397
6398     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6399
6400     cl_image_format fmt = { 0, 0 };
6401     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6402
6403     int depth = CV_8U;
6404     switch (fmt.image_channel_data_type)
6405     {
6406     case CL_UNORM_INT8:
6407     case CL_UNSIGNED_INT8:
6408         depth = CV_8U;
6409         break;
6410
6411     case CL_SNORM_INT8:
6412     case CL_SIGNED_INT8:
6413         depth = CV_8S;
6414         break;
6415
6416     case CL_UNORM_INT16:
6417     case CL_UNSIGNED_INT16:
6418         depth = CV_16U;
6419         break;
6420
6421     case CL_SNORM_INT16:
6422     case CL_SIGNED_INT16:
6423         depth = CV_16S;
6424         break;
6425
6426     case CL_SIGNED_INT32:
6427         depth = CV_32S;
6428         break;
6429
6430     case CL_FLOAT:
6431         depth = CV_32F;
6432         break;
6433
6434     default:
6435         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6436     }
6437
6438     int type = CV_8UC1;
6439     switch (fmt.image_channel_order)
6440     {
6441     case CL_R:
6442         type = CV_MAKE_TYPE(depth, 1);
6443         break;
6444
6445     case CL_RGBA:
6446     case CL_BGRA:
6447     case CL_ARGB:
6448         type = CV_MAKE_TYPE(depth, 4);
6449         break;
6450
6451     default:
6452         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6453         break;
6454     }
6455
6456     size_t step = 0;
6457     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6458
6459     size_t w = 0;
6460     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6461
6462     size_t h = 0;
6463     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6464
6465     dst.create((int)h, (int)w, type);
6466
6467     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6468
6469     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6470
6471     size_t offset = 0;
6472     size_t src_origin[3] = { 0, 0, 0 };
6473     size_t region[3] = { w, h, 1 };
6474     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6475
6476     CV_OCL_CHECK(clFinish(q));
6477
6478     return;
6479 } // convertFromImage()
6480
6481
6482 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6483
6484 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6485 {
6486     cl_uint numDevices = 0;
6487     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6488     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6489     {
6490         CV_OCL_DBG_CHECK_RESULT(status,
6491             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6492     }
6493
6494     if (numDevices == 0)
6495     {
6496         devices.clear();
6497         return;
6498     }
6499
6500     devices.resize((size_t)numDevices);
6501     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6502 }
6503
6504 struct PlatformInfo::Impl
6505 {
6506     Impl(void* id)
6507     {
6508         refcount = 1;
6509         handle = *(cl_platform_id*)id;
6510         getDevices(devices, handle);
6511     }
6512
6513     String getStrProp(cl_platform_info prop) const
6514     {
6515         char buf[1024];
6516         size_t sz=0;
6517         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6518             sz < sizeof(buf) ? String(buf) : String();
6519     }
6520
6521     IMPLEMENT_REFCOUNTABLE();
6522     std::vector<cl_device_id> devices;
6523     cl_platform_id handle;
6524 };
6525
6526 PlatformInfo::PlatformInfo()
6527 {
6528     p = 0;
6529 }
6530
6531 PlatformInfo::PlatformInfo(void* platform_id)
6532 {
6533     p = new Impl(platform_id);
6534 }
6535
6536 PlatformInfo::~PlatformInfo()
6537 {
6538     if(p)
6539         p->release();
6540 }
6541
6542 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6543 {
6544     if (i.p)
6545         i.p->addref();
6546     p = i.p;
6547 }
6548
6549 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6550 {
6551     if (i.p != p)
6552     {
6553         if (i.p)
6554             i.p->addref();
6555         if (p)
6556             p->release();
6557         p = i.p;
6558     }
6559     return *this;
6560 }
6561
6562 int PlatformInfo::deviceNumber() const
6563 {
6564     return p ? (int)p->devices.size() : 0;
6565 }
6566
6567 void PlatformInfo::getDevice(Device& device, int d) const
6568 {
6569     CV_Assert(p && d < (int)p->devices.size() );
6570     if(p)
6571         device.set(p->devices[d]);
6572 }
6573
6574 String PlatformInfo::name() const
6575 {
6576     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6577 }
6578
6579 String PlatformInfo::vendor() const
6580 {
6581     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6582 }
6583
6584 String PlatformInfo::version() const
6585 {
6586     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6587 }
6588
6589 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6590 {
6591     cl_uint numPlatforms = 0;
6592     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6593
6594     if (numPlatforms == 0)
6595     {
6596         platforms.clear();
6597         return;
6598     }
6599
6600     platforms.resize((size_t)numPlatforms);
6601     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6602 }
6603
6604 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6605 {
6606     std::vector<cl_platform_id> platforms;
6607     getPlatforms(platforms);
6608
6609     for (size_t i = 0; i < platforms.size(); i++)
6610         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6611 }
6612
6613 const char* typeToStr(int type)
6614 {
6615     static const char* tab[]=
6616     {
6617         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6618         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6619         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6620         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6621         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6622         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6623         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6624         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6625         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6626     };
6627     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6628     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6629     CV_Assert(result);
6630     return result;
6631 }
6632
6633 const char* memopTypeToStr(int type)
6634 {
6635     static const char* tab[] =
6636     {
6637         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6638         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6639         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6640         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6641         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6642         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6643         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6644         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6645         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6646     };
6647     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6648     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6649     CV_Assert(result);
6650     return result;
6651 }
6652
6653 const char* vecopTypeToStr(int type)
6654 {
6655     static const char* tab[] =
6656     {
6657         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6658         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6659         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6660         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6661         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6662         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6663         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6664         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6665         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6666     };
6667     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6668     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6669     CV_Assert(result);
6670     return result;
6671 }
6672
6673 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6674 {
6675     if( sdepth == ddepth )
6676         return "noconvert";
6677     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6678     if( ddepth >= CV_32F ||
6679         (ddepth == CV_32S && sdepth < CV_32S) ||
6680         (ddepth == CV_16S && sdepth <= CV_8S) ||
6681         (ddepth == CV_16U && sdepth == CV_8U))
6682     {
6683         sprintf(buf, "convert_%s", typestr);
6684     }
6685     else if( sdepth >= CV_32F )
6686         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6687     else
6688         sprintf(buf, "convert_%s_sat", typestr);
6689
6690     return buf;
6691 }
6692
6693 const char* getOpenCLErrorString(int errorCode)
6694 {
6695 #define CV_OCL_CODE(id) case id: return #id
6696 #define CV_OCL_CODE_(id, name) case id: return #name
6697     switch (errorCode)
6698     {
6699     CV_OCL_CODE(CL_SUCCESS);
6700     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6701     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6702     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6703     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6704     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6705     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6706     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6707     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6708     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6709     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6710     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6711     CV_OCL_CODE(CL_MAP_FAILURE);
6712     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6713     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6714     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6715     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6716     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6717     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6718     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6719     CV_OCL_CODE(CL_INVALID_VALUE);
6720     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6721     CV_OCL_CODE(CL_INVALID_PLATFORM);
6722     CV_OCL_CODE(CL_INVALID_DEVICE);
6723     CV_OCL_CODE(CL_INVALID_CONTEXT);
6724     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6725     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6726     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6727     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6728     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6729     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6730     CV_OCL_CODE(CL_INVALID_SAMPLER);
6731     CV_OCL_CODE(CL_INVALID_BINARY);
6732     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6733     CV_OCL_CODE(CL_INVALID_PROGRAM);
6734     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6735     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6736     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6737     CV_OCL_CODE(CL_INVALID_KERNEL);
6738     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6739     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6740     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6741     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6742     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6743     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6744     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6745     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6746     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6747     CV_OCL_CODE(CL_INVALID_EVENT);
6748     CV_OCL_CODE(CL_INVALID_OPERATION);
6749     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6750     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6751     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6752     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6753     // OpenCL 1.1
6754     CV_OCL_CODE(CL_INVALID_PROPERTY);
6755     // OpenCL 1.2
6756     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6757     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6758     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6759     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6760     // OpenCL 2.0
6761     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6762     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6763     // Extensions
6764     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6765     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6766     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6767     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6768     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6769     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6770     default: return "Unknown OpenCL error";
6771     }
6772 #undef CV_OCL_CODE
6773 #undef CV_OCL_CODE_
6774 }
6775
6776 template <typename T>
6777 static std::string kerToStr(const Mat & k)
6778 {
6779     int width = k.cols - 1, depth = k.depth();
6780     const T * const data = k.ptr<T>();
6781
6782     std::ostringstream stream;
6783     stream.precision(10);
6784
6785     if (depth <= CV_8S)
6786     {
6787         for (int i = 0; i < width; ++i)
6788             stream << "DIG(" << (int)data[i] << ")";
6789         stream << "DIG(" << (int)data[width] << ")";
6790     }
6791     else if (depth == CV_32F)
6792     {
6793         stream.setf(std::ios_base::showpoint);
6794         for (int i = 0; i < width; ++i)
6795             stream << "DIG(" << data[i] << "f)";
6796         stream << "DIG(" << data[width] << "f)";
6797     }
6798     else
6799     {
6800         for (int i = 0; i < width; ++i)
6801             stream << "DIG(" << data[i] << ")";
6802         stream << "DIG(" << data[width] << ")";
6803     }
6804
6805     return stream.str();
6806 }
6807
6808 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6809 {
6810     Mat kernel = _kernel.getMat().reshape(1, 1);
6811
6812     int depth = kernel.depth();
6813     if (ddepth < 0)
6814         ddepth = depth;
6815
6816     if (ddepth != depth)
6817         kernel.convertTo(kernel, ddepth);
6818
6819     typedef std::string (* func_t)(const Mat &);
6820     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6821                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6822     const func_t func = funcs[ddepth];
6823     CV_Assert(func != 0);
6824
6825     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6826 }
6827
6828 #define PROCESS_SRC(src) \
6829     do \
6830     { \
6831         if (!src.empty()) \
6832         { \
6833             CV_Assert(src.isMat() || src.isUMat()); \
6834             Size csize = src.size(); \
6835             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6836                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6837             if (cwidth < ckercn || ckercn <= 0) \
6838                 return 1; \
6839             cols.push_back(cwidth); \
6840             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6841                 return 1; \
6842             offsets.push_back(src.offset()); \
6843             steps.push_back(src.step()); \
6844             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6845             kercns.push_back(ckercn); \
6846         } \
6847     } \
6848     while ((void)0, 0)
6849
6850 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6851                               InputArray src4, InputArray src5, InputArray src6,
6852                               InputArray src7, InputArray src8, InputArray src9,
6853                               OclVectorStrategy strat)
6854 {
6855     const ocl::Device & d = ocl::Device::getDefault();
6856
6857     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6858         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6859         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6860         d.preferredVectorWidthDouble(), -1 };
6861
6862     // if the device says don't use vectors
6863     if (vectorWidths[0] == 1)
6864     {
6865         // it's heuristic
6866         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6867         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6868         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6869     }
6870
6871     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6872 }
6873
6874 int checkOptimalVectorWidth(const int *vectorWidths,
6875                             InputArray src1, InputArray src2, InputArray src3,
6876                             InputArray src4, InputArray src5, InputArray src6,
6877                             InputArray src7, InputArray src8, InputArray src9,
6878                             OclVectorStrategy strat)
6879 {
6880     CV_Assert(vectorWidths);
6881
6882     int ref_type = src1.type();
6883
6884     std::vector<size_t> offsets, steps, cols;
6885     std::vector<int> dividers, kercns;
6886     PROCESS_SRC(src1);
6887     PROCESS_SRC(src2);
6888     PROCESS_SRC(src3);
6889     PROCESS_SRC(src4);
6890     PROCESS_SRC(src5);
6891     PROCESS_SRC(src6);
6892     PROCESS_SRC(src7);
6893     PROCESS_SRC(src8);
6894     PROCESS_SRC(src9);
6895
6896     size_t size = offsets.size();
6897
6898     for (size_t i = 0; i < size; ++i)
6899         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6900             dividers[i] >>= 1, kercns[i] >>= 1;
6901
6902     // default strategy
6903     int kercn = *std::min_element(kercns.begin(), kercns.end());
6904
6905     return kercn;
6906 }
6907
6908 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6909                                  InputArray src4, InputArray src5, InputArray src6,
6910                                  InputArray src7, InputArray src8, InputArray src9)
6911 {
6912     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6913 }
6914
6915 #undef PROCESS_SRC
6916
6917
6918 // TODO Make this as a method of OpenCL "BuildOptions" class
6919 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6920 {
6921     if (!buildOptions.empty())
6922         buildOptions += " ";
6923     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6924     buildOptions += format(
6925             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6926             name.c_str(), ocl::typeToStr(type),
6927             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6928             name.c_str(), (int)CV_MAT_CN(type),
6929             name.c_str(), (int)CV_ELEM_SIZE(type),
6930             name.c_str(), (int)CV_ELEM_SIZE1(type),
6931             name.c_str(), (int)depth
6932             );
6933 }
6934
6935
6936 struct Image2D::Impl
6937 {
6938     Impl(const UMat &src, bool norm, bool alias)
6939     {
6940         handle = 0;
6941         refcount = 1;
6942         init(src, norm, alias);
6943     }
6944
6945     ~Impl()
6946     {
6947         if (handle)
6948             clReleaseMemObject(handle);
6949     }
6950
6951     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6952     {
6953         cl_image_format format;
6954         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6955                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6956         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6957                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6958         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6959
6960         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6961         int channelOrder = channelOrders[cn];
6962         format.image_channel_data_type = (cl_channel_type)channelType;
6963         format.image_channel_order = (cl_channel_order)channelOrder;
6964         return format;
6965     }
6966
6967     static bool isFormatSupported(cl_image_format format)
6968     {
6969         if (!haveOpenCL())
6970             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6971
6972         cl_context context = (cl_context)Context::getDefault().ptr();
6973         if (!context)
6974             return false;
6975
6976         // Figure out how many formats are supported by this context.
6977         cl_uint numFormats = 0;
6978         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6979                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6980                                                 NULL, &numFormats);
6981         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6982         if (numFormats > 0)
6983         {
6984             AutoBuffer<cl_image_format> formats(numFormats);
6985             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6986                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6987                                              formats.data(), NULL);
6988             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6989             for (cl_uint i = 0; i < numFormats; ++i)
6990             {
6991                 if (!memcmp(&formats[i], &format, sizeof(format)))
6992                 {
6993                     return true;
6994                 }
6995             }
6996         }
6997         return false;
6998     }
6999
7000     void init(const UMat &src, bool norm, bool alias)
7001     {
7002         if (!haveOpenCL())
7003             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7004
7005         CV_Assert(!src.empty());
7006         CV_Assert(ocl::Device::getDefault().imageSupport());
7007
7008         int err, depth = src.depth(), cn = src.channels();
7009         CV_Assert(cn <= 4);
7010         cl_image_format format = getImageFormat(depth, cn, norm);
7011
7012         if (!isFormatSupported(format))
7013             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7014
7015         if (alias && !src.handle(ACCESS_RW))
7016             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7017
7018         cl_context context = (cl_context)Context::getDefault().ptr();
7019         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7020
7021 #ifdef CL_VERSION_1_2
7022         // this enables backwards portability to
7023         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7024         const Device & d = ocl::Device::getDefault();
7025         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7026         CV_Assert(!alias || canCreateAlias(src));
7027         if (1 < major || (1 == major && 2 <= minor))
7028         {
7029             cl_image_desc desc;
7030             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7031             desc.image_width      = src.cols;
7032             desc.image_height     = src.rows;
7033             desc.image_depth      = 0;
7034             desc.image_array_size = 1;
7035             desc.image_row_pitch  = alias ? src.step[0] : 0;
7036             desc.image_slice_pitch = 0;
7037             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7038             desc.num_mip_levels   = 0;
7039             desc.num_samples      = 0;
7040             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7041         }
7042         else
7043 #endif
7044         {
7045             CV_SUPPRESS_DEPRECATED_START
7046             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7047             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7048             CV_SUPPRESS_DEPRECATED_END
7049         }
7050         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7051
7052         size_t origin[] = { 0, 0, 0 };
7053         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7054
7055         cl_mem devData;
7056         if (!alias && !src.isContinuous())
7057         {
7058             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7059             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7060                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7061                 ).c_str());
7062
7063             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7064             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7065                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7066             CV_OCL_DBG_CHECK(clFlush(queue));
7067         }
7068         else
7069         {
7070             devData = (cl_mem)src.handle(ACCESS_READ);
7071         }
7072         CV_Assert(devData != NULL);
7073
7074         if (!alias)
7075         {
7076             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7077             if (!src.isContinuous())
7078             {
7079                 CV_OCL_DBG_CHECK(clFlush(queue));
7080                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7081             }
7082         }
7083     }
7084
7085     IMPLEMENT_REFCOUNTABLE();
7086
7087     cl_mem handle;
7088 };
7089
7090 Image2D::Image2D()
7091 {
7092     p = NULL;
7093 }
7094
7095 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7096 {
7097     p = new Impl(src, norm, alias);
7098 }
7099
7100 bool Image2D::canCreateAlias(const UMat &m)
7101 {
7102     bool ret = false;
7103     const Device & d = ocl::Device::getDefault();
7104     if (d.imageFromBufferSupport() && !m.empty())
7105     {
7106         // This is the required pitch alignment in pixels
7107         uint pitchAlign = d.imagePitchAlignment();
7108         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7109         {
7110             // We don't currently handle the case where the buffer was created
7111             // with CL_MEM_USE_HOST_PTR
7112             if (!m.u->tempUMat())
7113             {
7114                 ret = true;
7115             }
7116         }
7117     }
7118     return ret;
7119 }
7120
7121 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7122 {
7123     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7124
7125     return Impl::isFormatSupported(format);
7126 }
7127
7128 Image2D::Image2D(const Image2D & i)
7129 {
7130     p = i.p;
7131     if (p)
7132         p->addref();
7133 }
7134
7135 Image2D & Image2D::operator = (const Image2D & i)
7136 {
7137     if (i.p != p)
7138     {
7139         if (i.p)
7140             i.p->addref();
7141         if (p)
7142             p->release();
7143         p = i.p;
7144     }
7145     return *this;
7146 }
7147
7148 Image2D::~Image2D()
7149 {
7150     if (p)
7151         p->release();
7152 }
7153
7154 void* Image2D::ptr() const
7155 {
7156     return p ? p->handle : 0;
7157 }
7158
7159 bool internal::isOpenCLForced()
7160 {
7161     static bool initialized = false;
7162     static bool value = false;
7163     if (!initialized)
7164     {
7165         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7166         initialized = true;
7167     }
7168     return value;
7169 }
7170
7171 bool internal::isPerformanceCheckBypassed()
7172 {
7173     static bool initialized = false;
7174     static bool value = false;
7175     if (!initialized)
7176     {
7177         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7178         initialized = true;
7179     }
7180     return value;
7181 }
7182
7183 bool internal::isCLBuffer(UMat& u)
7184 {
7185     void* h = u.handle(ACCESS_RW);
7186     if (!h)
7187         return true;
7188     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7189 #if 1
7190     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7191         return false;
7192 #else
7193     cl_mem_object_type type = 0;
7194     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7195     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7196         return false;
7197 #endif
7198     return true;
7199 }
7200
7201 struct Timer::Impl
7202 {
7203     const Queue queue;
7204
7205     Impl(const Queue& q)
7206         : queue(q)
7207     {
7208     }
7209
7210     ~Impl(){}
7211
7212     void start()
7213     {
7214         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7215         timer.start();
7216     }
7217
7218     void stop()
7219     {
7220         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7221         timer.stop();
7222     }
7223
7224     uint64 durationNS() const
7225     {
7226         return (uint64)(timer.getTimeSec() * 1e9);
7227     }
7228
7229     TickMeter timer;
7230 };
7231
7232 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7233 Timer::~Timer() { delete p; }
7234
7235 void Timer::start()
7236 {
7237     CV_Assert(p);
7238     p->start();
7239 }
7240
7241 void Timer::stop()
7242 {
7243     CV_Assert(p);
7244     p->stop();
7245 }
7246
7247 uint64 Timer::durationNS() const
7248 {
7249     CV_Assert(p);
7250     return p->durationNS();
7251 }
7252
7253 }} // namespace
7254
7255 #endif // HAVE_OPENCL