Merge remote-tracking branch 'upstream/3.4' into merge-3.4
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_DIRECTX
117 #include "directx.hpp"
118 #endif
119
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
124 #endif
125
126 #include "umatrix.hpp"
127
128 namespace cv { namespace ocl {
129
130 #define IMPLEMENT_REFCOUNTABLE() \
131     void addref() { CV_XADD(&refcount, 1); } \
132     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
133     int refcount
134
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
136
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
139 {
140     return opencl_allocator_stats;
141 }
142
143 #ifndef _DEBUG
144 static bool isRaiseError()
145 {
146     static bool initialized = false;
147     static bool value = false;
148     if (!initialized)
149     {
150         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
151         initialized = true;
152     }
153     return value;
154 }
155 #endif
156
157 #if CV_OPENCL_TRACE_CHECK
158 static inline
159 void traceOpenCLCheck(cl_int status, const char* message)
160 {
161     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
162 }
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
164 #else
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
166 #endif
167
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
170
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
172     do { \
173         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174         if (check_result != CL_SUCCESS) \
175         { \
176             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178             CV_Error(Error::OpenCLApiCallError, error_msg); \
179         } \
180     } while (0)
181
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
183
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
185
186 #ifdef _DEBUG
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
190 #else
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
192     do { \
193         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194         if (check_result != CL_SUCCESS && isRaiseError()) \
195         { \
196             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198             CV_Error(Error::OpenCLApiCallError, error_msg); \
199         } \
200     } while (0)
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
203 #endif
204
205
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
210
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
213 #endif
214
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
217 #ifdef __APPLE__
218         true
219 #else
220         false
221 #endif
222 );
223
224 static const String getBuildExtraOptions()
225 {
226     static String param_buildExtraOptions;
227     static bool initialized = false;
228     if (!initialized)
229     {
230         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
231         initialized = true;
232         if (!param_buildExtraOptions.empty())
233             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
234     }
235     return param_buildExtraOptions;
236 }
237
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
240
241
242 struct UMat2D
243 {
244     UMat2D(const UMat& m)
245     {
246         offset = (int)m.offset;
247         step = (int)m.step;
248         rows = m.rows;
249         cols = m.cols;
250     }
251     int offset;
252     int step;
253     int rows;
254     int cols;
255 };
256
257 struct UMat3D
258 {
259     UMat3D(const UMat& m)
260     {
261         offset = (int)m.offset;
262         step = (int)m.step.p[1];
263         slicestep = (int)m.step.p[0];
264         slices = (int)m.size.p[0];
265         rows = m.size.p[1];
266         cols = m.size.p[2];
267     }
268     int offset;
269     int slicestep;
270     int step;
271     int slices;
272     int rows;
273     int cols;
274 };
275
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
278 {
279     static uint64 table[256];
280     static bool initialized = false;
281
282     if( !initialized )
283     {
284         for( int i = 0; i < 256; i++ )
285         {
286             uint64 c = i;
287             for( int j = 0; j < 8; j++ )
288                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
289             table[i] = c;
290         }
291         initialized = true;
292     }
293
294     uint64 crc = ~crc0;
295     for( size_t idx = 0; idx < size; idx++ )
296         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
297
298     return ~crc;
299 }
300
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
303 {
304     cv::String cache_path_;
305     cv::String cache_lock_filename_;
306     cv::Ptr<utils::fs::FileLock> cache_lock_;
307
308     typedef std::map<std::string, std::string> ContextCacheType;
309     ContextCacheType prepared_contexts_;
310     Mutex mutex_prepared_contexts_;
311
312     OpenCLBinaryCacheConfigurator()
313     {
314         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315         if (!CV_OPENCL_CACHE_ENABLE)
316         {
317             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
318             return;
319         }
320         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321         if (cache_path_.empty())
322         {
323             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
324         }
325         do
326         {
327             try
328             {
329                 if (cache_path_.empty())
330                     break;
331                 if (cache_path_ == "disabled")
332                     break;
333                 if (!utils::fs::createDirectories(cache_path_))
334                 {
335                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
336                     clear();
337                     break;
338                 }
339
340                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
341                 {
342                     cache_lock_filename_ = cache_path_ + ".lock";
343                     if (!utils::fs::exists(cache_lock_filename_))
344                     {
345                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347                         if (!lock_filename.is_open())
348                         {
349                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
350                             break;
351                         }
352                     }
353
354                     try
355                     {
356                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
358                         {
359                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
360                         }
361                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
362                     }
363                     catch (const cv::Exception& e)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
366                     }
367                     catch (...)
368                     {
369                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
370                     }
371                 }
372                 else
373                 {
374                     if (CV_OPENCL_CACHE_WRITE)
375                     {
376                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377                                 "(not safe for multiprocess environment)");
378                     }
379                     else
380                     {
381                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
382                     }
383                 }
384             }
385             catch (const cv::Exception& e)
386             {
387                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
388                 clear();
389             }
390         } while (0);
391         if (!cache_path_.empty())
392         {
393             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
394             {
395                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
397             }
398             else
399             {
400                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
401             }
402         }
403     }
404
405     void clear()
406     {
407         cache_path_.clear();
408         cache_lock_filename_.clear();
409         cache_lock_.release();
410     }
411
412     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413             const std::string& cleanup_prefix)
414     {
415         if (cache_path_.empty())
416             return std::string();
417
418         AutoLock lock(mutex_prepared_contexts_);
419
420         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421         if (found_it != prepared_contexts_.end())
422             return found_it->second;
423
424         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
425
426         std::string target_directory = cache_path_ + ctx_prefix + "/";
427         bool result = utils::fs::isDirectory(target_directory);
428         if (!result)
429         {
430             try
431             {
432                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433                 if (utils::fs::createDirectories(target_directory))
434                 {
435                     result = true;
436                 }
437                 else
438                 {
439                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
440                 }
441             }
442             catch (const cv::Exception& e)
443             {
444                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
445             }
446         }
447         target_directory = result ? target_directory : std::string();
448         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
449
450         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
451         {
452             try
453             {
454                 std::vector<String> entries;
455                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456                 std::vector<String> remove_entries;
457                 for (size_t i = 0; i < entries.size(); i++)
458                 {
459                     const String& name = entries[i];
460                     if (0 == name.find(cleanup_prefix))
461                     {
462                         if (0 == name.find(ctx_prefix))
463                             continue; // skip current
464                         remove_entries.push_back(name);
465                     }
466                 }
467                 if (!remove_entries.empty())
468                 {
469                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470                             ? "Detected OpenCL cache directory for other version of OpenCL device."
471                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
472                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474                     for (size_t i = 0; i < remove_entries.size(); i++)
475                     {
476                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
477                     }
478                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
479
480                     for (size_t i = 0; i < remove_entries.size(); i++)
481                     {
482                         const String& name = remove_entries[i];
483                         cv::String path = utils::fs::join(cache_path_, name);
484                         try
485                         {
486                             utils::fs::remove_all(path);
487                             CV_LOG_WARNING(NULL, "Removed: " << path);
488                         }
489                         catch (const cv::Exception& e)
490                         {
491                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
492                         }
493                     }
494                 }
495             }
496             catch (...)
497             {
498                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
499             }
500         }
501
502         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503         return target_directory;
504     }
505
506     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
507     {
508         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
509     }
510 };
511 class BinaryProgramFile
512 {
513     enum { MAX_ENTRIES = 64 };
514
515     typedef unsigned int uint32_t;
516
517     struct CV_DECL_ALIGNED(4) FileHeader
518     {
519         uint32_t sourceSignatureSize;
520         //char sourceSignature[];
521     };
522
523     struct CV_DECL_ALIGNED(4) FileTable
524     {
525         uint32_t numberOfEntries;
526         //uint32_t firstEntryOffset[];
527     };
528
529     struct CV_DECL_ALIGNED(4) FileEntry
530     {
531         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
532         uint32_t keySize;
533         uint32_t dataSize;
534         //char key[];
535         //char data[];
536     };
537
538     const std::string fileName_;
539     const char* const sourceSignature_;
540     const size_t sourceSignatureSize_;
541
542     std::fstream f;
543
544     uint32_t entryOffsets[MAX_ENTRIES];
545
546     uint32_t getHash(const std::string& options)
547     {
548         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549         return hash & (MAX_ENTRIES - 1);
550     }
551
552     inline size_t getFileSize()
553     {
554         size_t pos = (size_t)f.tellg();
555         f.seekg(0, std::fstream::end);
556         size_t fileSize = (size_t)f.tellg();
557         f.seekg(pos, std::fstream::beg);
558         return fileSize;
559     }
560     inline uint32_t readUInt32()
561     {
562         uint32_t res = 0;
563         f.read((char*)&res, sizeof(uint32_t));
564         CV_Assert(!f.fail());
565         return res;
566     }
567     inline void writeUInt32(const uint32_t value)
568     {
569         uint32_t v = value;
570         f.write((char*)&v, sizeof(uint32_t));
571         CV_Assert(!f.fail());
572     }
573
574     inline void seekReadAbsolute(size_t pos)
575     {
576         f.seekg(pos, std::fstream::beg);
577         CV_Assert(!f.fail());
578     }
579     inline void seekReadRelative(size_t pos)
580     {
581         f.seekg(pos, std::fstream::cur);
582         CV_Assert(!f.fail());
583     }
584
585     inline void seekWriteAbsolute(size_t pos)
586     {
587         f.seekp(pos, std::fstream::beg);
588         CV_Assert(!f.fail());
589     }
590
591     void clearFile()
592     {
593         f.close();
594         if (0 != remove(fileName_.c_str()))
595             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
596         return;
597     }
598
599 public:
600     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
602     {
603         CV_StaticAssert(sizeof(uint32_t) == 4, "");
604         CV_Assert(sourceSignature_ != NULL);
605         CV_Assert(sourceSignatureSize_ > 0);
606         memset(entryOffsets, 0, sizeof(entryOffsets));
607
608         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610         if(f.is_open() && getFileSize() > 0)
611         {
612             bool isValid = false;
613             try
614             {
615                 uint32_t fileSourceSignatureSize = readUInt32();
616                 if (fileSourceSignatureSize == sourceSignatureSize_)
617                 {
618                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
620                     if (f.eof())
621                     {
622                         CV_LOG_ERROR(NULL, "Unexpected EOF");
623                     }
624                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
625                     {
626                         isValid = true;
627                     }
628                 }
629                 if (!isValid)
630                 {
631                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
632                 }
633             }
634             catch (const cv::Exception& e)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
637             }
638             catch (...)
639             {
640                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
641             }
642             if (!isValid)
643             {
644                 clearFile();
645             }
646             else
647             {
648                 seekReadAbsolute(0);
649             }
650         }
651     }
652
653     bool read(const std::string& key, std::vector<char>& buf)
654     {
655         if (!f.is_open())
656             return false;
657
658         size_t fileSize = getFileSize();
659         if (fileSize == 0)
660         {
661             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
662             clearFile();
663             return false;
664         }
665         seekReadAbsolute(0);
666
667         // bypass FileHeader
668         uint32_t fileSourceSignatureSize = readUInt32();
669         CV_Assert(fileSourceSignatureSize > 0);
670         seekReadRelative(fileSourceSignatureSize);
671
672         uint32_t numberOfEntries = readUInt32();
673         CV_Assert(numberOfEntries > 0);
674         if (numberOfEntries != MAX_ENTRIES)
675         {
676             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
677             clearFile();
678             return false;
679         }
680         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681         CV_Assert(!f.fail());
682
683         uint32_t entryNum = getHash(key);
684
685         uint32_t entryOffset = entryOffsets[entryNum];
686         FileEntry entry;
687         while (entryOffset > 0)
688         {
689             seekReadAbsolute(entryOffset);
690             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691             f.read((char*)&entry, sizeof(entry));
692             CV_Assert(!f.fail());
693             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694             if (key.size() == entry.keySize)
695             {
696                 if (entry.keySize > 0)
697                 {
698                     f.read(fileKey.data(), entry.keySize);
699                     CV_Assert(!f.fail());
700                 }
701                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
702                 {
703                     buf.resize(entry.dataSize);
704                     f.read(&buf[0], entry.dataSize);
705                     CV_Assert(!f.fail());
706                     seekReadAbsolute(0);
707                     CV_LOG_VERBOSE(NULL, 0, "Read...");
708                     return true;
709                 }
710             }
711             if (entry.nextEntryFileOffset == 0)
712                 break;
713             entryOffset = entry.nextEntryFileOffset;
714         }
715         return false;
716     }
717
718     bool write(const std::string& key, std::vector<char>& buf)
719     {
720         if (!f.is_open())
721         {
722             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
723             if (!f.is_open())
724             {
725                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
726                 if (!f.is_open())
727                 {
728                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
729                     return false;
730                 }
731             }
732         }
733
734         size_t fileSize = getFileSize();
735         if (fileSize == 0)
736         {
737             // Write header
738             seekWriteAbsolute(0);
739             writeUInt32((uint32_t)sourceSignatureSize_);
740             f.write(sourceSignature_, sourceSignatureSize_);
741             CV_Assert(!f.fail());
742
743             writeUInt32(MAX_ENTRIES);
744             memset(entryOffsets, 0, sizeof(entryOffsets));
745             f.write((char*)entryOffsets, sizeof(entryOffsets));
746             CV_Assert(!f.fail());
747             f.flush();
748             CV_Assert(!f.fail());
749             f.close();
750             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751             CV_Assert(f.is_open());
752             fileSize = getFileSize();
753         }
754         seekReadAbsolute(0);
755
756         // bypass FileHeader
757         uint32_t fileSourceSignatureSize = readUInt32();
758         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759         seekReadRelative(fileSourceSignatureSize);
760
761         uint32_t numberOfEntries = readUInt32();
762         CV_Assert(numberOfEntries > 0);
763         if (numberOfEntries != MAX_ENTRIES)
764         {
765             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
766             clearFile();
767             return false;
768         }
769         size_t tableEntriesOffset = (size_t)f.tellg();
770         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771         CV_Assert(!f.fail());
772
773         uint32_t entryNum = getHash(key);
774
775         uint32_t entryOffset = entryOffsets[entryNum];
776         FileEntry entry;
777         while (entryOffset > 0)
778         {
779             seekReadAbsolute(entryOffset);
780             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781             f.read((char*)&entry, sizeof(entry));
782             CV_Assert(!f.fail());
783             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784             if (key.size() == entry.keySize)
785             {
786                 if (entry.keySize > 0)
787                 {
788                     f.read(fileKey.data(), entry.keySize);
789                     CV_Assert(!f.fail());
790                 }
791                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
792                 {
793                     // duplicate
794                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
795                     return false;
796                 }
797             }
798             if (entry.nextEntryFileOffset == 0)
799                 break;
800             entryOffset = entry.nextEntryFileOffset;
801         }
802         seekReadAbsolute(0);
803         if (entryOffset > 0)
804         {
805             seekWriteAbsolute(entryOffset);
806             entry.nextEntryFileOffset = (uint32_t)fileSize;
807             f.write((char*)&entry, sizeof(entry));
808             CV_Assert(!f.fail());
809         }
810         else
811         {
812             entryOffsets[entryNum] = (uint32_t)fileSize;
813             seekWriteAbsolute(tableEntriesOffset);
814             f.write((char*)entryOffsets, sizeof(entryOffsets));
815             CV_Assert(!f.fail());
816         }
817         seekWriteAbsolute(fileSize);
818         entry.nextEntryFileOffset = 0;
819         entry.dataSize = (uint32_t)buf.size();
820         entry.keySize = (uint32_t)key.size();
821         f.write((char*)&entry, sizeof(entry));
822         CV_Assert(!f.fail());
823         f.write(key.c_str(), entry.keySize);
824         CV_Assert(!f.fail());
825         f.write(&buf[0], entry.dataSize);
826         CV_Assert(!f.fail());
827         f.flush();
828         CV_Assert(!f.fail());
829         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
830         return true;
831     }
832 };
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
834
835
836
837 struct OpenCLExecutionContext::Impl
838 {
839     ocl::Context context_;
840     int device_;  // device index in context
841     ocl::Queue queue_;
842     int useOpenCL_;
843
844 protected:
845     Impl() = delete;
846
847     void _init_device(cl_device_id deviceID)
848     {
849         CV_Assert(deviceID);
850         int ndevices = (int)context_.ndevices();
851         CV_Assert(ndevices > 0);
852         bool found = false;
853         for (int i = 0; i < ndevices; i++)
854         {
855             ocl::Device d = context_.device(i);
856             cl_device_id dhandle = (cl_device_id)d.ptr();
857             if (dhandle == deviceID)
858             {
859                 device_ = i;
860                 found = true;
861                 break;
862             }
863         }
864         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
865     }
866
867     void _init_device(const ocl::Device& device)
868     {
869         CV_Assert(device.ptr());
870         int ndevices = (int)context_.ndevices();
871         CV_Assert(ndevices > 0);
872         bool found = false;
873         for (int i = 0; i < ndevices; i++)
874         {
875             ocl::Device d = context_.device(i);
876             if (d.getImpl() == device.getImpl())
877             {
878                 device_ = i;
879                 found = true;
880                 break;
881             }
882         }
883         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
884     }
885
886 public:
887     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888         : device_(0), useOpenCL_(-1)
889     {
890         CV_UNUSED(platformID);
891         CV_Assert(context);
892         CV_Assert(deviceID);
893
894         context_ = Context::fromHandle(context);
895         _init_device(deviceID);
896         queue_ = Queue(context_, context_.device(device_));
897     }
898
899     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900         : device_(0), useOpenCL_(-1)
901     {
902         CV_Assert(context.ptr());
903         CV_Assert(device.ptr());
904
905         context_ = context;
906         _init_device(device);
907         queue_ = queue;
908     }
909
910     Impl(const ocl::Context& context, const ocl::Device& device)
911         : device_(0), useOpenCL_(-1)
912     {
913         CV_Assert(context.ptr());
914         CV_Assert(device.ptr());
915
916         context_ = context;
917         _init_device(device);
918         queue_ = Queue(context_, context_.device(device_));
919     }
920
921     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
922         : context_(context)
923         , device_(device)
924         , queue_(queue)
925         , useOpenCL_(-1)
926     {
927         // nothing
928     }
929     Impl(const Impl& other)
930         : context_(other.context_)
931         , device_(other.device_)
932         , queue_(other.queue_)
933         , useOpenCL_(-1)
934     {
935         // nothing
936     }
937
938     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
939     bool useOpenCL()
940     {
941         if (useOpenCL_ < 0)
942         {
943             try
944             {
945                 useOpenCL_ = 0;
946                 if (!context_.empty() && context_.ndevices() > 0)
947                 {
948                     const Device& d = context_.device(device_);
949                     useOpenCL_ = d.available();
950                 }
951             }
952             catch (const cv::Exception&)
953             {
954                 // nothing
955             }
956             if (!useOpenCL_)
957                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
958         }
959         return useOpenCL_ > 0;
960     }
961
962     void setUseOpenCL(bool flag)
963     {
964         if (!flag)
965             useOpenCL_ = 0;
966         else
967             useOpenCL_ = -1;
968     }
969
970     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
971     {
972         CV_TRACE_FUNCTION();
973
974         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
975
976         static bool initialized = false;
977         static std::shared_ptr<Impl> g_primaryExecutionContext;
978
979         if (!initialized)
980         {
981             cv::AutoLock lock(getInitializationMutex());
982             if (!initialized)
983             {
984                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
985                 try
986                 {
987                     Context c = ocl::Context::create(std::string());
988                     if (c.ndevices())
989                     {
990                         int deviceId = 0;
991                         auto& d = c.device(deviceId);
992                         if (d.available())
993                         {
994                             auto q = ocl::Queue(c, d);
995                             if (!q.ptr())
996                             {
997                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
998                             }
999                             else
1000                             {
1001                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1003                             }
1004                         }
1005                         else
1006                         {
1007                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1008                         }
1009                     }
1010                     else
1011                     {
1012                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1013                     }
1014                 }
1015                 catch (const std::exception& e)
1016                 {
1017                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1018                 }
1019                 catch (...)
1020                 {
1021                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1022                 }
1023                 initialized = true;
1024             }
1025         }
1026         return g_primaryExecutionContext;
1027     }
1028 };
1029
1030 Context& OpenCLExecutionContext::getContext() const
1031 {
1032     CV_Assert(p);
1033     return p->context_;
1034 }
1035 Device& OpenCLExecutionContext::getDevice() const
1036 {
1037     CV_Assert(p);
1038     return p->context_.device(p->device_);
1039 }
1040 Queue& OpenCLExecutionContext::getQueue() const
1041 {
1042     CV_Assert(p);
1043     return p->queue_;
1044 }
1045
1046 bool OpenCLExecutionContext::useOpenCL() const
1047 {
1048     if (p)
1049         return p->useOpenCL();
1050     return false;
1051 }
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1053 {
1054     CV_Assert(p);
1055     p->setUseOpenCL(flag);
1056 }
1057
1058 /* static */
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1060 {
1061     CV_TRACE_FUNCTION();
1062     CoreTLSData& data = getCoreTlsData();
1063     OpenCLExecutionContext& c = data.oclExecutionContext;
1064     if (!data.oclExecutionContextInitialized)
1065     {
1066         data.oclExecutionContextInitialized = true;
1067         if (c.empty() && haveOpenCL())
1068             c.p = Impl::getInitializedExecutionContext();
1069     }
1070     return c;
1071 }
1072
1073 /* static */
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1075 {
1076     CV_TRACE_FUNCTION();
1077     CoreTLSData& data = getCoreTlsData();
1078     OpenCLExecutionContext& c = data.oclExecutionContext;
1079     return c;
1080 }
1081
1082 void OpenCLExecutionContext::bind() const
1083 {
1084     CV_TRACE_FUNCTION();
1085     CV_Assert(p);
1086     CoreTLSData& data = getCoreTlsData();
1087     data.oclExecutionContext = *this;
1088     data.oclExecutionContextInitialized = true;
1089     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1090 }
1091
1092
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1094 {
1095     CV_TRACE_FUNCTION();
1096     CV_Assert(p);
1097     const Queue q(getContext(), getDevice());
1098     return cloneWithNewQueue(q);
1099 }
1100
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1102 {
1103     CV_TRACE_FUNCTION();
1104     CV_Assert(p);
1105     CV_Assert(q.ptr() != NULL);
1106     OpenCLExecutionContext c;
1107     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1108     return c;
1109 }
1110
1111 /* static */
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1113 {
1114     CV_TRACE_FUNCTION();
1115     if (!haveOpenCL())
1116         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1117
1118     CV_Assert(!context.empty());
1119     CV_Assert(context.ptr());
1120     CV_Assert(!device.empty());
1121     CV_Assert(device.ptr());
1122     OpenCLExecutionContext ctx;
1123     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1124     return ctx;
1125
1126 }
1127
1128 /* static */
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1130 {
1131     CV_TRACE_FUNCTION();
1132     if (!haveOpenCL())
1133         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1134
1135     CV_Assert(!context.empty());
1136     CV_Assert(context.ptr());
1137     CV_Assert(!device.empty());
1138     CV_Assert(device.ptr());
1139     OpenCLExecutionContext ctx;
1140     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1141     return ctx;
1142
1143 }
1144
1145 void OpenCLExecutionContext::release()
1146 {
1147     CV_TRACE_FUNCTION();
1148     p.reset();
1149 }
1150
1151
1152 // true if we have initialized OpenCL subsystem with available platforms
1153 static bool g_isOpenCVActivated = false;
1154
1155 bool haveOpenCL()
1156 {
1157     CV_TRACE_FUNCTION();
1158     static bool g_isOpenCLInitialized = false;
1159     static bool g_isOpenCLAvailable = false;
1160
1161     if (!g_isOpenCLInitialized)
1162     {
1163         CV_TRACE_REGION("Init_OpenCL_Runtime");
1164         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1165         if (envPath)
1166         {
1167             if (cv::String(envPath) == "disabled")
1168             {
1169                 g_isOpenCLAvailable = false;
1170                 g_isOpenCLInitialized = true;
1171                 return false;
1172             }
1173         }
1174
1175         cv::AutoLock lock(getInitializationMutex());
1176         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1177         try
1178         {
1179             cl_uint n = 0;
1180             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181             g_isOpenCVActivated = n > 0;
1182             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1183         }
1184         catch (...)
1185         {
1186             g_isOpenCLAvailable = false;
1187         }
1188         g_isOpenCLInitialized = true;
1189     }
1190     return g_isOpenCLAvailable;
1191 }
1192
1193 bool useOpenCL()
1194 {
1195     CoreTLSData& data = getCoreTlsData();
1196     if (data.useOpenCL < 0)
1197     {
1198         try
1199         {
1200             data.useOpenCL = 0;
1201             if (haveOpenCL())
1202             {
1203                 auto c = OpenCLExecutionContext::getCurrent();
1204                 data.useOpenCL = c.useOpenCL();
1205             }
1206         }
1207         catch (...)
1208         {
1209             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1210         }
1211     }
1212     return data.useOpenCL > 0;
1213 }
1214
1215 bool isOpenCLActivated()
1216 {
1217     if (!g_isOpenCVActivated)
1218         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1219     return useOpenCL();
1220 }
1221
1222 void setUseOpenCL(bool flag)
1223 {
1224     CV_TRACE_FUNCTION();
1225
1226     CoreTLSData& data = getCoreTlsData();
1227     auto& c = OpenCLExecutionContext::getCurrentRef();
1228     if (!c.empty())
1229     {
1230         c.setUseOpenCL(flag);
1231         data.useOpenCL = c.useOpenCL();
1232     }
1233     else
1234     {
1235         if (!flag)
1236             data.useOpenCL = 0;
1237         else
1238             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1239     }
1240 }
1241
1242
1243
1244 #ifdef HAVE_CLAMDBLAS
1245
1246 class AmdBlasHelper
1247 {
1248 public:
1249     static AmdBlasHelper & getInstance()
1250     {
1251         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1252     }
1253
1254     bool isAvailable() const
1255     {
1256         return g_isAmdBlasAvailable;
1257     }
1258
1259     ~AmdBlasHelper()
1260     {
1261         try
1262         {
1263             clAmdBlasTeardown();
1264         }
1265         catch (...) { }
1266     }
1267
1268 protected:
1269     AmdBlasHelper()
1270     {
1271         if (!g_isAmdBlasInitialized)
1272         {
1273             AutoLock lock(getInitializationMutex());
1274
1275             if (!g_isAmdBlasInitialized)
1276             {
1277                 if (haveOpenCL())
1278                 {
1279                     try
1280                     {
1281                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1282                     }
1283                     catch (...)
1284                     {
1285                         g_isAmdBlasAvailable = false;
1286                     }
1287                 }
1288                 else
1289                     g_isAmdBlasAvailable = false;
1290
1291                 g_isAmdBlasInitialized = true;
1292             }
1293         }
1294     }
1295
1296 private:
1297     static bool g_isAmdBlasInitialized;
1298     static bool g_isAmdBlasAvailable;
1299 };
1300
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1303
1304 bool haveAmdBlas()
1305 {
1306     return AmdBlasHelper::getInstance().isAvailable();
1307 }
1308
1309 #else
1310
1311 bool haveAmdBlas()
1312 {
1313     return false;
1314 }
1315
1316 #endif
1317
1318 #ifdef HAVE_CLAMDFFT
1319
1320 class AmdFftHelper
1321 {
1322 public:
1323     static AmdFftHelper & getInstance()
1324     {
1325         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1326     }
1327
1328     bool isAvailable() const
1329     {
1330         return g_isAmdFftAvailable;
1331     }
1332
1333     ~AmdFftHelper()
1334     {
1335         try
1336         {
1337 //            clAmdFftTeardown();
1338         }
1339         catch (...) { }
1340     }
1341
1342 protected:
1343     AmdFftHelper()
1344     {
1345         if (!g_isAmdFftInitialized)
1346         {
1347             AutoLock lock(getInitializationMutex());
1348
1349             if (!g_isAmdFftInitialized)
1350             {
1351                 if (haveOpenCL())
1352                 {
1353                     try
1354                     {
1355                         cl_uint major, minor, patch;
1356                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1357
1358                         // it throws exception in case AmdFft binaries are not found
1359                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360                         g_isAmdFftAvailable = true;
1361                     }
1362                     catch (const Exception &)
1363                     {
1364                         g_isAmdFftAvailable = false;
1365                     }
1366                 }
1367                 else
1368                     g_isAmdFftAvailable = false;
1369
1370                 g_isAmdFftInitialized = true;
1371             }
1372         }
1373     }
1374
1375 private:
1376     static clAmdFftSetupData setupData;
1377     static bool g_isAmdFftInitialized;
1378     static bool g_isAmdFftAvailable;
1379 };
1380
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1384
1385 bool haveAmdFft()
1386 {
1387     return AmdFftHelper::getInstance().isAvailable();
1388 }
1389
1390 #else
1391
1392 bool haveAmdFft()
1393 {
1394     return false;
1395 }
1396
1397 #endif
1398
1399 bool haveSVM()
1400 {
1401 #ifdef HAVE_OPENCL_SVM
1402     return true;
1403 #else
1404     return false;
1405 #endif
1406 }
1407
1408 void finish()
1409 {
1410     Queue::getDefault().finish();
1411 }
1412
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1414
1415 struct Platform::Impl
1416 {
1417     Impl()
1418     {
1419         refcount = 1;
1420         handle = 0;
1421         initialized = false;
1422     }
1423
1424     ~Impl() {}
1425
1426     void init()
1427     {
1428         if( !initialized )
1429         {
1430             //cl_uint num_entries
1431             cl_uint n = 0;
1432             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1433                 handle = 0;
1434             if( handle != 0 )
1435             {
1436                 char buf[1000];
1437                 size_t len = 0;
1438                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1439                 buf[len] = '\0';
1440                 vendor = String(buf);
1441             }
1442
1443             initialized = true;
1444         }
1445     }
1446
1447     IMPLEMENT_REFCOUNTABLE();
1448
1449     cl_platform_id handle;
1450     String vendor;
1451     bool initialized;
1452 };
1453
1454 Platform::Platform()
1455 {
1456     p = 0;
1457 }
1458
1459 Platform::~Platform()
1460 {
1461     if(p)
1462         p->release();
1463 }
1464
1465 Platform::Platform(const Platform& pl)
1466 {
1467     p = (Impl*)pl.p;
1468     if(p)
1469         p->addref();
1470 }
1471
1472 Platform& Platform::operator = (const Platform& pl)
1473 {
1474     Impl* newp = (Impl*)pl.p;
1475     if(newp)
1476         newp->addref();
1477     if(p)
1478         p->release();
1479     p = newp;
1480     return *this;
1481 }
1482
1483 void* Platform::ptr() const
1484 {
1485     return p ? p->handle : 0;
1486 }
1487
1488 Platform& Platform::getDefault()
1489 {
1490     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1491     static Platform p;
1492     if( !p.p )
1493     {
1494         p.p = new Impl;
1495         p.p->init();
1496     }
1497     return p;
1498 }
1499
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1501
1502 // deviceVersion has format
1503 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1504 // by specification
1505 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1508 {
1509     major = minor = 0;
1510     if (10 >= deviceVersion.length())
1511         return;
1512     const char *pstr = deviceVersion.c_str();
1513     if (0 != strncmp(pstr, "OpenCL ", 7))
1514         return;
1515     size_t ppos = deviceVersion.find('.', 7);
1516     if (String::npos == ppos)
1517         return;
1518     String temp = deviceVersion.substr(7, ppos - 7);
1519     major = atoi(temp.c_str());
1520     temp = deviceVersion.substr(ppos + 1);
1521     minor = atoi(temp.c_str());
1522 }
1523
1524 struct Device::Impl
1525 {
1526     Impl(void* d)
1527         : refcount(1)
1528         , handle(0)
1529     {
1530         try
1531         {
1532             cl_device_id device = (cl_device_id)d;
1533             _init(device);
1534             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1535         }
1536         catch (...)
1537         {
1538             throw;
1539         }
1540     }
1541
1542     void _init(cl_device_id d)
1543     {
1544         handle = (cl_device_id)d;
1545
1546         name_ = getStrProp(CL_DEVICE_NAME);
1547         version_ = getStrProp(CL_DEVICE_VERSION);
1548         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1549         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1550         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1551         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1552         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1553         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1554         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1555         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1556
1557         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1558         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1559
1560         size_t pos = 0;
1561         while (pos < extensions_.size())
1562         {
1563             size_t pos2 = extensions_.find(' ', pos);
1564             if (pos2 == String::npos)
1565                 pos2 = extensions_.size();
1566             if (pos2 > pos)
1567             {
1568                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1569                 extensions_set_.insert(extensionName);
1570             }
1571             pos = pos2 + 1;
1572         }
1573
1574         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1575
1576         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1577         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1578             vendorName_ == "AMD")
1579             vendorID_ = VENDOR_AMD;
1580         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1581             vendorID_ = VENDOR_INTEL;
1582         else if (vendorName_ == "NVIDIA Corporation")
1583             vendorID_ = VENDOR_NVIDIA;
1584         else
1585             vendorID_ = UNKNOWN_VENDOR;
1586
1587         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1588         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1589         {
1590             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1591             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1592                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1593             maxWorkGroupSize_ = new_maxWorkGroupSize;
1594         }
1595 #if 0
1596         if (isExtensionSupported("cl_khr_spir"))
1597         {
1598 #ifndef CL_DEVICE_SPIR_VERSIONS
1599 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1600 #endif
1601             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1602             std::cout << spir_versions << std::endl;
1603         }
1604 #endif
1605     }
1606
1607     ~Impl()
1608     {
1609 #ifdef _WIN32
1610         if (!cv::__termination)
1611 #endif
1612         {
1613             if (handle)
1614             {
1615                 CV_OCL_CHECK(clReleaseDevice(handle));
1616                 handle = 0;
1617             }
1618         }
1619     }
1620
1621     template<typename _TpCL, typename _TpOut>
1622     _TpOut getProp(cl_device_info prop) const
1623     {
1624         _TpCL temp=_TpCL();
1625         size_t sz = 0;
1626
1627         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1628             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1629     }
1630
1631     bool getBoolProp(cl_device_info prop) const
1632     {
1633         cl_bool temp = CL_FALSE;
1634         size_t sz = 0;
1635
1636         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1637             sz == sizeof(temp) ? temp != 0 : false;
1638     }
1639
1640     String getStrProp(cl_device_info prop) const
1641     {
1642         char buf[4096];
1643         size_t sz=0;
1644         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1645             sz < sizeof(buf) ? String(buf) : String();
1646     }
1647
1648     bool isExtensionSupported(const std::string& extensionName) const
1649     {
1650         return extensions_set_.count(extensionName) > 0;
1651     }
1652
1653
1654     IMPLEMENT_REFCOUNTABLE();
1655
1656     cl_device_id handle;
1657
1658     String name_;
1659     String version_;
1660     std::string extensions_;
1661     int doubleFPConfig_;
1662     bool hostUnifiedMemory_;
1663     int maxComputeUnits_;
1664     size_t maxWorkGroupSize_;
1665     int type_;
1666     int addressBits_;
1667     int deviceVersionMajor_;
1668     int deviceVersionMinor_;
1669     String driverVersion_;
1670     String vendorName_;
1671     int vendorID_;
1672     bool intelSubgroupsSupport_;
1673
1674     std::set<std::string> extensions_set_;
1675 };
1676
1677
1678 Device::Device()
1679 {
1680     p = 0;
1681 }
1682
1683 Device::Device(void* d)
1684 {
1685     p = 0;
1686     set(d);
1687 }
1688
1689 Device::Device(const Device& d)
1690 {
1691     p = d.p;
1692     if(p)
1693         p->addref();
1694 }
1695
1696 Device& Device::operator = (const Device& d)
1697 {
1698     Impl* newp = (Impl*)d.p;
1699     if(newp)
1700         newp->addref();
1701     if(p)
1702         p->release();
1703     p = newp;
1704     return *this;
1705 }
1706
1707 Device::~Device()
1708 {
1709     if(p)
1710         p->release();
1711 }
1712
1713 void Device::set(void* d)
1714 {
1715     if(p)
1716         p->release();
1717     p = new Impl(d);
1718     if (p->handle)
1719     {
1720         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1721     }
1722 }
1723
1724 Device Device::fromHandle(void* d)
1725 {
1726     Device device(d);
1727     return device;
1728 }
1729
1730 void* Device::ptr() const
1731 {
1732     return p ? p->handle : 0;
1733 }
1734
1735 String Device::name() const
1736 { return p ? p->name_ : String(); }
1737
1738 String Device::extensions() const
1739 { return p ? String(p->extensions_) : String(); }
1740
1741 bool Device::isExtensionSupported(const String& extensionName) const
1742 { return p ? p->isExtensionSupported(extensionName) : false; }
1743
1744 String Device::version() const
1745 { return p ? p->version_ : String(); }
1746
1747 String Device::vendorName() const
1748 { return p ? p->vendorName_ : String(); }
1749
1750 int Device::vendorID() const
1751 { return p ? p->vendorID_ : 0; }
1752
1753 String Device::OpenCL_C_Version() const
1754 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1755
1756 String Device::OpenCLVersion() const
1757 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1758
1759 int Device::deviceVersionMajor() const
1760 { return p ? p->deviceVersionMajor_ : 0; }
1761
1762 int Device::deviceVersionMinor() const
1763 { return p ? p->deviceVersionMinor_ : 0; }
1764
1765 String Device::driverVersion() const
1766 { return p ? p->driverVersion_ : String(); }
1767
1768 int Device::type() const
1769 { return p ? p->type_ : 0; }
1770
1771 int Device::addressBits() const
1772 { return p ? p->addressBits_ : 0; }
1773
1774 bool Device::available() const
1775 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1776
1777 bool Device::compilerAvailable() const
1778 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1779
1780 bool Device::linkerAvailable() const
1781 #ifdef CL_VERSION_1_2
1782 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1783 #else
1784 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1785 #endif
1786
1787 int Device::doubleFPConfig() const
1788 { return p ? p->doubleFPConfig_ : 0; }
1789
1790 int Device::singleFPConfig() const
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1792
1793 int Device::halfFPConfig() const
1794 #ifdef CL_VERSION_1_2
1795 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1796 #else
1797 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1798 #endif
1799
1800 bool Device::endianLittle() const
1801 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1802
1803 bool Device::errorCorrectionSupport() const
1804 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1805
1806 int Device::executionCapabilities() const
1807 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1808
1809 size_t Device::globalMemCacheSize() const
1810 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1811
1812 int Device::globalMemCacheType() const
1813 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1814
1815 int Device::globalMemCacheLineSize() const
1816 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1817
1818 size_t Device::globalMemSize() const
1819 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1820
1821 size_t Device::localMemSize() const
1822 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1823
1824 int Device::localMemType() const
1825 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1826
1827 bool Device::hostUnifiedMemory() const
1828 { return p ? p->hostUnifiedMemory_ : false; }
1829
1830 bool Device::imageSupport() const
1831 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1832
1833 bool Device::imageFromBufferSupport() const
1834 {
1835     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1836 }
1837
1838 uint Device::imagePitchAlignment() const
1839 {
1840 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1841     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1842 #else
1843     return 0;
1844 #endif
1845 }
1846
1847 uint Device::imageBaseAddressAlignment() const
1848 {
1849 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1850     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1851 #else
1852     return 0;
1853 #endif
1854 }
1855
1856 size_t Device::image2DMaxWidth() const
1857 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1858
1859 size_t Device::image2DMaxHeight() const
1860 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1861
1862 size_t Device::image3DMaxWidth() const
1863 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1864
1865 size_t Device::image3DMaxHeight() const
1866 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1867
1868 size_t Device::image3DMaxDepth() const
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1870
1871 size_t Device::imageMaxBufferSize() const
1872 #ifdef CL_VERSION_1_2
1873 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1874 #else
1875 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1876 #endif
1877
1878 size_t Device::imageMaxArraySize() const
1879 #ifdef CL_VERSION_1_2
1880 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1881 #else
1882 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1883 #endif
1884
1885 bool Device::intelSubgroupsSupport() const
1886 { return p ? p->intelSubgroupsSupport_ : false; }
1887
1888 int Device::maxClockFrequency() const
1889 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1890
1891 int Device::maxComputeUnits() const
1892 { return p ? p->maxComputeUnits_ : 0; }
1893
1894 int Device::maxConstantArgs() const
1895 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1896
1897 size_t Device::maxConstantBufferSize() const
1898 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1899
1900 size_t Device::maxMemAllocSize() const
1901 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1902
1903 size_t Device::maxParameterSize() const
1904 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1905
1906 int Device::maxReadImageArgs() const
1907 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1908
1909 int Device::maxWriteImageArgs() const
1910 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1911
1912 int Device::maxSamplers() const
1913 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1914
1915 size_t Device::maxWorkGroupSize() const
1916 { return p ? p->maxWorkGroupSize_ : 0; }
1917
1918 int Device::maxWorkItemDims() const
1919 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1920
1921 void Device::maxWorkItemSizes(size_t* sizes) const
1922 {
1923     if(p)
1924     {
1925         const int MAX_DIMS = 32;
1926         size_t retsz = 0;
1927         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1928                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1929     }
1930 }
1931
1932 int Device::memBaseAddrAlign() const
1933 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1934
1935 int Device::nativeVectorWidthChar() const
1936 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1937
1938 int Device::nativeVectorWidthShort() const
1939 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1940
1941 int Device::nativeVectorWidthInt() const
1942 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1943
1944 int Device::nativeVectorWidthLong() const
1945 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1946
1947 int Device::nativeVectorWidthFloat() const
1948 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1949
1950 int Device::nativeVectorWidthDouble() const
1951 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1952
1953 int Device::nativeVectorWidthHalf() const
1954 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1955
1956 int Device::preferredVectorWidthChar() const
1957 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1958
1959 int Device::preferredVectorWidthShort() const
1960 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1961
1962 int Device::preferredVectorWidthInt() const
1963 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1964
1965 int Device::preferredVectorWidthLong() const
1966 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1967
1968 int Device::preferredVectorWidthFloat() const
1969 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1970
1971 int Device::preferredVectorWidthDouble() const
1972 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1973
1974 int Device::preferredVectorWidthHalf() const
1975 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1976
1977 size_t Device::printfBufferSize() const
1978 #ifdef CL_VERSION_1_2
1979 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1980 #else
1981 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1982 #endif
1983
1984
1985 size_t Device::profilingTimerResolution() const
1986 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1987
1988 const Device& Device::getDefault()
1989 {
1990     auto& c = OpenCLExecutionContext::getCurrent();
1991     if (!c.empty())
1992     {
1993         return c.getDevice();
1994     }
1995
1996     static Device dummy;
1997     return dummy;
1998 }
1999
2000 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2001
2002 template <typename Functor, typename ObjectType>
2003 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2004 {
2005     ::size_t required;
2006     cl_int err = f(obj, name, 0, NULL, &required);
2007     if (err != CL_SUCCESS)
2008         return err;
2009
2010     param.clear();
2011     if (required > 0)
2012     {
2013         AutoBuffer<char> buf(required + 1);
2014         char* ptr = buf.data(); // cleanup is not needed
2015         err = f(obj, name, required, ptr, NULL);
2016         if (err != CL_SUCCESS)
2017             return err;
2018         param = ptr;
2019     }
2020
2021     return CL_SUCCESS;
2022 }
2023
2024 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2025 {
2026     elems.clear();
2027     if (s.size() == 0)
2028         return;
2029     std::istringstream ss(s);
2030     std::string item;
2031     while (!ss.eof())
2032     {
2033         std::getline(ss, item, delim);
2034         elems.push_back(item);
2035     }
2036 }
2037
2038 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2039 // Sample: AMD:GPU:
2040 // Sample: AMD:GPU:Tahiti
2041 // Sample: :GPU|CPU: = '' = ':' = '::'
2042 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2043         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2044 {
2045     std::vector<std::string> parts;
2046     split(configurationStr, ':', parts);
2047     if (parts.size() > 3)
2048     {
2049         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2050         return false;
2051     }
2052     if (parts.size() > 2)
2053         deviceNameOrID = parts[2];
2054     if (parts.size() > 1)
2055     {
2056         split(parts[1], '|', deviceTypes);
2057     }
2058     if (parts.size() > 0)
2059     {
2060         platform = parts[0];
2061     }
2062     return true;
2063 }
2064
2065 #if defined WINRT || defined _WIN32_WCE
2066 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2067 {
2068     CV_UNUSED(configuration)
2069     return NULL;
2070 }
2071 #else
2072 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2073 {
2074     std::string platform, deviceName;
2075     std::vector<std::string> deviceTypes;
2076
2077     if (!configuration)
2078         configuration = getenv("OPENCV_OPENCL_DEVICE");
2079
2080     if (configuration &&
2081             (strcmp(configuration, "disabled") == 0 ||
2082              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2083             ))
2084         return NULL;
2085
2086     bool isID = false;
2087     int deviceID = -1;
2088     if (deviceName.length() == 1)
2089     // We limit ID range to 0..9, because we want to write:
2090     // - '2500' to mean i5-2500
2091     // - '8350' to mean AMD FX-8350
2092     // - '650' to mean GeForce 650
2093     // To extend ID range change condition to '> 0'
2094     {
2095         isID = true;
2096         for (size_t i = 0; i < deviceName.length(); i++)
2097         {
2098             if (!isdigit(deviceName[i]))
2099             {
2100                 isID = false;
2101                 break;
2102             }
2103         }
2104         if (isID)
2105         {
2106             deviceID = atoi(deviceName.c_str());
2107             if (deviceID < 0)
2108                 return NULL;
2109         }
2110     }
2111
2112     std::vector<cl_platform_id> platforms;
2113     {
2114         cl_uint numPlatforms = 0;
2115         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2116
2117         if (numPlatforms == 0)
2118             return NULL;
2119         platforms.resize((size_t)numPlatforms);
2120         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2121         platforms.resize(numPlatforms);
2122     }
2123
2124     int selectedPlatform = -1;
2125     if (platform.length() > 0)
2126     {
2127         for (size_t i = 0; i < platforms.size(); i++)
2128         {
2129             std::string name;
2130             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2131             if (name.find(platform) != std::string::npos)
2132             {
2133                 selectedPlatform = (int)i;
2134                 break;
2135             }
2136         }
2137         if (selectedPlatform == -1)
2138         {
2139             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2140             goto not_found;
2141         }
2142     }
2143     if (deviceTypes.size() == 0)
2144     {
2145         if (!isID)
2146         {
2147             deviceTypes.push_back("GPU");
2148             if (configuration)
2149                 deviceTypes.push_back("CPU");
2150         }
2151         else
2152             deviceTypes.push_back("ALL");
2153     }
2154     for (size_t t = 0; t < deviceTypes.size(); t++)
2155     {
2156         int deviceType = 0;
2157         std::string tempStrDeviceType = deviceTypes[t];
2158         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2159
2160         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2161             deviceType = Device::TYPE_GPU;
2162         else if (tempStrDeviceType == "cpu")
2163             deviceType = Device::TYPE_CPU;
2164         else if (tempStrDeviceType == "accelerator")
2165             deviceType = Device::TYPE_ACCELERATOR;
2166         else if (tempStrDeviceType == "all")
2167             deviceType = Device::TYPE_ALL;
2168         else
2169         {
2170             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2171             goto not_found;
2172         }
2173
2174         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2175         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2176                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2177                 i++)
2178         {
2179             cl_uint count = 0;
2180             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2181             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2182             {
2183                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2184             }
2185             if (count == 0)
2186                 continue;
2187             size_t base = devices.size();
2188             devices.resize(base + count);
2189             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2190             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2191             {
2192                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2193             }
2194         }
2195
2196         for (size_t i = (isID ? deviceID : 0);
2197              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2198              i++)
2199         {
2200             std::string name;
2201             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2202             cl_bool useGPU = true;
2203             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2204             {
2205                 cl_bool isIGPU = CL_FALSE;
2206                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2207                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2208             }
2209             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2210             {
2211                 // TODO check for OpenCL 1.1
2212                 return devices[i];
2213             }
2214         }
2215     }
2216
2217 not_found:
2218     if (!configuration)
2219         return NULL; // suppress messages on stderr
2220
2221     std::ostringstream msg;
2222     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2223         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2224         << "    Device types:";
2225     for (size_t t = 0; t < deviceTypes.size(); t++)
2226         msg << ' ' << deviceTypes[t];
2227
2228     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2229
2230     CV_LOG_ERROR(NULL, msg.str());
2231     return NULL;
2232 }
2233 #endif
2234
2235 #ifdef HAVE_OPENCL_SVM
2236 namespace svm {
2237
2238 enum AllocatorFlags { // don't use first 16 bits
2239         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2240         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2241         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2242         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2243         OPENCL_SVM_BUFFER_MAP = 4 << 16
2244 };
2245
2246 static bool checkForceSVMUmatUsage()
2247 {
2248     static bool initialized = false;
2249     static bool force = false;
2250     if (!initialized)
2251     {
2252         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2253         initialized = true;
2254     }
2255     return force;
2256 }
2257 static bool checkDisableSVMUMatUsage()
2258 {
2259     static bool initialized = false;
2260     static bool force = false;
2261     if (!initialized)
2262     {
2263         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2264         initialized = true;
2265     }
2266     return force;
2267 }
2268 static bool checkDisableSVM()
2269 {
2270     static bool initialized = false;
2271     static bool force = false;
2272     if (!initialized)
2273     {
2274         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2275         initialized = true;
2276     }
2277     return force;
2278 }
2279 // see SVMCapabilities
2280 static unsigned int getSVMCapabilitiesMask()
2281 {
2282     static bool initialized = false;
2283     static unsigned int mask = 0;
2284     if (!initialized)
2285     {
2286         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2287         if (envValue == NULL)
2288         {
2289             return ~0U; // all bits 1
2290         }
2291         mask = atoi(envValue);
2292         initialized = true;
2293     }
2294     return mask;
2295 }
2296 } // namespace
2297 #endif
2298
2299 static size_t getProgramCountLimit()
2300 {
2301     static bool initialized = false;
2302     static size_t count = 0;
2303     if (!initialized)
2304     {
2305         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2306         initialized = true;
2307     }
2308     return count;
2309 }
2310
2311 static int g_contextId = 0;
2312
2313 class OpenCLBufferPoolImpl;
2314 class OpenCLSVMBufferPoolImpl;
2315
2316 struct Context::Impl
2317 {
2318     static Context::Impl* get(Context& context) { return context.p; }
2319
2320     typedef std::deque<Context::Impl*> container_t;
2321     static container_t& getGlobalContainer()
2322     {
2323         // never delete this container (Impl lifetime is greater due to TLS storage)
2324         static container_t* g_contexts = new container_t();
2325         return *g_contexts;
2326     }
2327
2328 protected:
2329     Impl(const std::string& configuration_)
2330         : refcount(1)
2331         , contextId(CV_XADD(&g_contextId, 1))
2332         , configuration(configuration_)
2333         , handle(0)
2334 #ifdef HAVE_DIRECTX
2335         , p_directx_impl(0)
2336 #endif
2337 #ifdef HAVE_OPENCL_SVM
2338         , svmInitialized(false)
2339 #endif
2340     {
2341         if (!haveOpenCL())
2342             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2343
2344         cv::AutoLock lock(cv::getInitializationMutex());
2345         auto& container = getGlobalContainer();
2346         container.resize(std::max(container.size(), (size_t)contextId + 1));
2347         container[contextId] = this;
2348     }
2349
2350     ~Impl()
2351     {
2352 #ifdef _WIN32
2353         if (!cv::__termination)
2354 #endif
2355         {
2356             if (handle)
2357             {
2358                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2359                 handle = NULL;
2360             }
2361             devices.clear();
2362 #ifdef HAVE_DIRECTX
2363             directx::internal::deleteDirectXImpl(&p_directx_impl);
2364 #endif
2365         }
2366
2367         {
2368             cv::AutoLock lock(cv::getInitializationMutex());
2369             auto& container = getGlobalContainer();
2370             CV_CheckLT((size_t)contextId, container.size(), "");
2371             container[contextId] = NULL;
2372         }
2373     }
2374
2375     void init_device_list()
2376     {
2377         CV_Assert(handle);
2378
2379         cl_uint ndevices = 0;
2380         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2381         CV_Assert(ndevices > 0);
2382
2383         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2384         size_t devices_ret_size = 0;
2385         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2386         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2387
2388         devices.clear();
2389         for (unsigned i = 0; i < ndevices; i++)
2390         {
2391             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2392         }
2393     }
2394
2395     void __init_buffer_pools();  // w/o synchronization
2396     void _init_buffer_pools() const
2397     {
2398         if (!bufferPool_)
2399         {
2400             cv::AutoLock lock(cv::getInitializationMutex());
2401             if (!bufferPool_)
2402             {
2403                 const_cast<Impl*>(this)->__init_buffer_pools();
2404             }
2405         }
2406     }
2407 public:
2408     static Impl* findContext(const std::string& configuration)
2409     {
2410         CV_TRACE_FUNCTION();
2411         cv::AutoLock lock(cv::getInitializationMutex());
2412         auto& container = getGlobalContainer();
2413         if (configuration.empty() && !container.empty())
2414             return container[0];
2415         for (auto it = container.begin(); it != container.end(); ++it)
2416         {
2417             Impl* i = *it;
2418             if (i && i->configuration == configuration)
2419             {
2420                 return i;
2421             }
2422         }
2423         return NULL;
2424     }
2425
2426     static Impl* findOrCreateContext(const std::string& configuration_)
2427     {
2428         CV_TRACE_FUNCTION();
2429         std::string configuration = configuration_;
2430         if (configuration_.empty())
2431         {
2432             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2433             if (c)
2434                 configuration = c;
2435         }
2436         Impl* impl = findContext(configuration);
2437         if (impl)
2438         {
2439             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2440             return impl;
2441         }
2442
2443         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2444         if (d == NULL)
2445             return NULL;
2446
2447         impl = new Impl(configuration);
2448         try
2449         {
2450             impl->createFromDevice(d);
2451             if (impl->handle)
2452                 return impl;
2453             delete impl;
2454             return NULL;
2455         }
2456         catch (...)
2457         {
2458             delete impl;
2459             throw;
2460         }
2461     }
2462
2463     static Impl* findOrCreateContext(cl_context h)
2464     {
2465         CV_TRACE_FUNCTION();
2466
2467         CV_Assert(h);
2468
2469         std::string configuration = cv::format("@ctx-%p", (void*)h);
2470         Impl* impl = findContext(configuration);
2471         if (impl)
2472         {
2473             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2474             impl->addref();
2475             return impl;
2476         }
2477
2478         impl = new Impl(configuration);
2479         try
2480         {
2481             CV_OCL_CHECK(clRetainContext(h));
2482             impl->handle = h;
2483             impl->init_device_list();
2484             return impl;
2485         }
2486         catch (...)
2487         {
2488             delete impl;
2489             throw;
2490         }
2491     }
2492
2493     static Impl* findOrCreateContext(const ocl::Device& device)
2494     {
2495         CV_TRACE_FUNCTION();
2496
2497         CV_Assert(!device.empty());
2498         cl_device_id d = (cl_device_id)device.ptr();
2499         CV_Assert(d);
2500
2501         std::string configuration = cv::format("@dev-%p", (void*)d);
2502         Impl* impl = findContext(configuration);
2503         if (impl)
2504         {
2505             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2506             impl->addref();
2507             return impl;
2508         }
2509
2510         impl = new Impl(configuration);
2511         try
2512         {
2513             impl->createFromDevice(d);
2514             CV_Assert(impl->handle);
2515             return impl;
2516         }
2517         catch (...)
2518         {
2519             delete impl;
2520             throw;
2521         }
2522     }
2523
2524     void setDefault()
2525     {
2526         CV_TRACE_FUNCTION();
2527         cl_device_id d = selectOpenCLDevice();
2528
2529         if (d == NULL)
2530             return;
2531
2532         createFromDevice(d);
2533     }
2534
2535     void createFromDevice(cl_device_id d)
2536     {
2537         CV_TRACE_FUNCTION();
2538         CV_Assert(handle == NULL);
2539
2540         cl_platform_id pl = NULL;
2541         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2542
2543         cl_context_properties prop[] =
2544         {
2545             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2546             0
2547         };
2548
2549         // !!! in the current implementation force the number of devices to 1 !!!
2550         cl_uint nd = 1;
2551         cl_int status;
2552
2553         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2554         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2555
2556         bool ok = handle != 0 && status == CL_SUCCESS;
2557         if( ok )
2558         {
2559             devices.resize(nd);
2560             devices[0].set(d);
2561         }
2562         else
2563             handle = NULL;
2564     }
2565
2566     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2567
2568     void unloadProg(Program& prog)
2569     {
2570         cv::AutoLock lock(program_cache_mutex);
2571         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2572         {
2573               phash_t::iterator it = phash.find(*i);
2574               if (it != phash.end())
2575               {
2576                   if (it->second.ptr() == prog.ptr())
2577                   {
2578                       phash.erase(*i);
2579                       cacheList.erase(i);
2580                       return;
2581                   }
2582               }
2583         }
2584     }
2585
2586     std::string& getPrefixString()
2587     {
2588         if (prefix.empty())
2589         {
2590             cv::AutoLock lock(program_cache_mutex);
2591             if (prefix.empty())
2592             {
2593                 CV_Assert(!devices.empty());
2594                 const Device& d = devices[0];
2595                 int bits = d.addressBits();
2596                 if (bits > 0 && bits != 64)
2597                     prefix = cv::format("%d-bit--", bits);
2598                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2599                 // sanitize chars
2600                 for (size_t i = 0; i < prefix.size(); i++)
2601                 {
2602                     char c = prefix[i];
2603                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2604                     {
2605                         prefix[i] = '_';
2606                     }
2607                 }
2608             }
2609         }
2610         return prefix;
2611     }
2612
2613     std::string& getPrefixBase()
2614     {
2615         if (prefix_base.empty())
2616         {
2617             cv::AutoLock lock(program_cache_mutex);
2618             if (prefix_base.empty())
2619             {
2620                 const Device& d = devices[0];
2621                 int bits = d.addressBits();
2622                 if (bits > 0 && bits != 64)
2623                     prefix_base = cv::format("%d-bit--", bits);
2624                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2625                 // sanitize chars
2626                 for (size_t i = 0; i < prefix_base.size(); i++)
2627                 {
2628                     char c = prefix_base[i];
2629                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2630                     {
2631                         prefix_base[i] = '_';
2632                     }
2633                 }
2634             }
2635         }
2636         return prefix_base;
2637     }
2638
2639     IMPLEMENT_REFCOUNTABLE();
2640
2641     const int contextId;  // global unique ID
2642     const std::string configuration;
2643
2644     cl_context handle;
2645     std::vector<Device> devices;
2646
2647     std::string prefix;
2648     std::string prefix_base;
2649
2650     cv::Mutex program_cache_mutex;
2651     typedef std::map<std::string, Program> phash_t;
2652     phash_t phash;
2653     typedef std::list<cv::String> CacheList;
2654     CacheList cacheList;
2655
2656     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2657     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2658     OpenCLBufferPoolImpl& getBufferPool() const
2659     {
2660         _init_buffer_pools();
2661         CV_DbgAssert(bufferPool_);
2662         return *bufferPool_.get();
2663     }
2664     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2665     {
2666         _init_buffer_pools();
2667         CV_DbgAssert(bufferPoolHostPtr_);
2668         return *bufferPoolHostPtr_.get();
2669     }
2670
2671 #ifdef HAVE_DIRECTX
2672     directx::internal::OpenCLDirectXImpl* p_directx_impl;
2673
2674     directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2675     {
2676         if (!p_directx_impl)
2677         {
2678             p_directx_impl = directx::internal::createDirectXImpl();
2679         }
2680         return p_directx_impl;
2681     }
2682 #endif
2683
2684 #ifdef HAVE_OPENCL_SVM
2685     bool svmInitialized;
2686     bool svmAvailable;
2687     bool svmEnabled;
2688     svm::SVMCapabilities svmCapabilities;
2689     svm::SVMFunctions svmFunctions;
2690
2691     void svmInit()
2692     {
2693         CV_Assert(handle != NULL);
2694         const Device& device = devices[0];
2695         cl_device_svm_capabilities deviceCaps = 0;
2696         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2697         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2698         if (status != CL_SUCCESS)
2699         {
2700             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2701             goto noSVM;
2702         }
2703         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2704         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2705         svmCapabilities.value_ =
2706                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2707                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2708                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2709                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2710         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2711         if (svmCapabilities.value_ == 0)
2712         {
2713             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2714             goto noSVM;
2715         }
2716         try
2717         {
2718             // Try OpenCL 2.0
2719             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2720             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2721             if (!ptr)
2722             {
2723                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2724                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2725             }
2726             try
2727             {
2728                 bool error = false;
2729                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2730                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2731                 {
2732                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2733                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2734                 }
2735                 clFinish(q);
2736                 try
2737                 {
2738                     ((int*)ptr)[0] = 100;
2739                 }
2740                 catch (...)
2741                 {
2742                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2743                     error = true;
2744                 }
2745                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2746                 {
2747                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2748                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2749                 }
2750                 clFinish(q);
2751                 if (error)
2752                 {
2753                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2754                 }
2755             }
2756             catch (...)
2757             {
2758                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2759                 clSVMFree(handle, ptr);
2760                 throw;
2761             }
2762             clSVMFree(handle, ptr);
2763             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2764             svmFunctions.fn_clSVMFree = clSVMFree;
2765             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2766             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2767             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2768             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2769             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2770             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2771             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2772         }
2773         catch (...)
2774         {
2775             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2776             try
2777             {
2778                 // Try HSA extension
2779                 String extensions = device.extensions();
2780                 if (extensions.find("cl_amd_svm") == String::npos)
2781                 {
2782                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2783                     goto noSVM;
2784                 }
2785                 cl_platform_id p = NULL;
2786                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2787                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2788                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2789                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2790                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2791                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2792                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2793                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2794                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2795                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2796                 CV_Assert(svmFunctions.isValid());
2797             }
2798             catch (...)
2799             {
2800                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2801                 goto noSVM;
2802             }
2803         }
2804
2805         svmAvailable = true;
2806         svmEnabled = !svm::checkDisableSVM();
2807         svmInitialized = true;
2808         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2809         return;
2810     noSVM:
2811         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2812         svmAvailable = false;
2813         svmEnabled = false;
2814         svmCapabilities.value_ = 0;
2815         svmInitialized = true;
2816         svmFunctions.fn_clSVMAlloc = NULL;
2817         return;
2818     }
2819
2820     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2821
2822     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2823     {
2824         _init_buffer_pools();
2825         CV_DbgAssert(bufferPoolSVM_);
2826         return *bufferPoolSVM_.get();
2827     }
2828 #endif
2829
2830     friend class Program;
2831 };
2832
2833
2834 Context::Context()
2835 {
2836     p = 0;
2837 }
2838
2839 Context::~Context()
2840 {
2841     release();
2842 }
2843
2844 // deprecated
2845 Context::Context(int dtype)
2846 {
2847     p = 0;
2848     create(dtype);
2849 }
2850
2851 void Context::release()
2852 {
2853     if (p)
2854     {
2855         p->release();
2856         p = NULL;
2857     }
2858 }
2859
2860 bool Context::create()
2861 {
2862     release();
2863     if (!haveOpenCL())
2864         return false;
2865     p = Impl::findOrCreateContext(std::string());
2866     if (p && p->handle)
2867         return true;
2868     release();
2869     return false;
2870 }
2871
2872 // deprecated
2873 bool Context::create(int dtype)
2874 {
2875     if( !haveOpenCL() )
2876         return false;
2877     release();
2878     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2879     {
2880         p = Impl::findOrCreateContext("");
2881     }
2882     else if (dtype == CL_DEVICE_TYPE_GPU)
2883     {
2884         p = Impl::findOrCreateContext(":GPU:");
2885     }
2886     else if (dtype == CL_DEVICE_TYPE_CPU)
2887     {
2888         p = Impl::findOrCreateContext(":CPU:");
2889     }
2890     else
2891     {
2892         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2893     }
2894     if (p && !p->handle)
2895     {
2896         release();
2897     }
2898     return p != 0;
2899 }
2900
2901 Context::Context(const Context& c)
2902 {
2903     p = (Impl*)c.p;
2904     if(p)
2905         p->addref();
2906 }
2907
2908 Context& Context::operator = (const Context& c)
2909 {
2910     Impl* newp = (Impl*)c.p;
2911     if(newp)
2912         newp->addref();
2913     if(p)
2914         p->release();
2915     p = newp;
2916     return *this;
2917 }
2918
2919 void* Context::ptr() const
2920 {
2921     return p == NULL ? NULL : p->handle;
2922 }
2923
2924 size_t Context::ndevices() const
2925 {
2926     return p ? p->devices.size() : 0;
2927 }
2928
2929 Device& Context::device(size_t idx) const
2930 {
2931     static Device dummy;
2932     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2933 }
2934
2935 Context& Context::getDefault(bool initialize)
2936 {
2937     auto& c = OpenCLExecutionContext::getCurrent();
2938     if (!c.empty())
2939     {
2940         auto& ctx = c.getContext();
2941         return ctx;
2942     }
2943
2944     CV_UNUSED(initialize);
2945     static Context dummy;
2946     return dummy;
2947 }
2948
2949 Program Context::getProg(const ProgramSource& prog,
2950                          const String& buildopts, String& errmsg)
2951 {
2952     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2953 }
2954
2955 void Context::unloadProg(Program& prog)
2956 {
2957     if (p)
2958         p->unloadProg(prog);
2959 }
2960
2961 /* static */
2962 Context Context::fromHandle(void* context)
2963 {
2964     Context ctx;
2965     ctx.p = Impl::findOrCreateContext((cl_context)context);
2966     return ctx;
2967 }
2968
2969 /* static */
2970 Context Context::fromDevice(const ocl::Device& device)
2971 {
2972     Context ctx;
2973     ctx.p = Impl::findOrCreateContext(device);
2974     return ctx;
2975 }
2976
2977 /* static */
2978 Context Context::create(const std::string& configuration)
2979 {
2980     Context ctx;
2981     ctx.p = Impl::findOrCreateContext(configuration);
2982     return ctx;
2983 }
2984
2985 #ifdef HAVE_OPENCL_SVM
2986 bool Context::useSVM() const
2987 {
2988     Context::Impl* i = p;
2989     CV_Assert(i);
2990     if (!i->svmInitialized)
2991         i->svmInit();
2992     return i->svmEnabled;
2993 }
2994 void Context::setUseSVM(bool enabled)
2995 {
2996     Context::Impl* i = p;
2997     CV_Assert(i);
2998     if (!i->svmInitialized)
2999         i->svmInit();
3000     if (enabled && !i->svmAvailable)
3001     {
3002         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3003     }
3004     i->svmEnabled = enabled;
3005 }
3006 #else
3007 bool Context::useSVM() const { return false; }
3008 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3009 #endif
3010
3011 #ifdef HAVE_OPENCL_SVM
3012 namespace svm {
3013
3014 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3015 {
3016     Context::Impl* i = context.p;
3017     CV_Assert(i);
3018     if (!i->svmInitialized)
3019         i->svmInit();
3020     return i->svmCapabilities;
3021 }
3022
3023 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3024 {
3025     Context::Impl* i = context.p;
3026     CV_Assert(i);
3027     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3028     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3029     return &i->svmFunctions;
3030 }
3031
3032 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3033 {
3034     if (checkForceSVMUmatUsage())
3035         return true;
3036     if (checkDisableSVMUMatUsage())
3037         return false;
3038     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3039         return true;
3040     return false; // don't use SVM by default
3041 }
3042
3043 } // namespace cv::ocl::svm
3044 #endif // HAVE_OPENCL_SVM
3045
3046
3047 static void get_platform_name(cl_platform_id id, String& name)
3048 {
3049     // get platform name string length
3050     size_t sz = 0;
3051     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3052
3053     // get platform name string
3054     AutoBuffer<char> buf(sz + 1);
3055     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3056
3057     // just in case, ensure trailing zero for ASCIIZ string
3058     buf[sz] = 0;
3059
3060     name = buf.data();
3061 }
3062
3063 /*
3064 // Attaches OpenCL context to OpenCV
3065 */
3066 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3067 {
3068     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3069     ctx.bind();
3070 }
3071
3072 /* static */
3073 OpenCLExecutionContext OpenCLExecutionContext::create(
3074         const std::string& platformName, void* platformID, void* context, void* deviceID
3075 )
3076 {
3077     if (!haveOpenCL())
3078         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3079
3080     cl_uint cnt = 0;
3081     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3082
3083     if (cnt == 0)
3084         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3085
3086     std::vector<cl_platform_id> platforms(cnt);
3087
3088     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3089
3090     bool platformAvailable = false;
3091
3092     // check if external platformName contained in list of available platforms in OpenCV
3093     for (unsigned int i = 0; i < cnt; i++)
3094     {
3095         String availablePlatformName;
3096         get_platform_name(platforms[i], availablePlatformName);
3097         // external platform is found in the list of available platforms
3098         if (platformName == availablePlatformName)
3099         {
3100             platformAvailable = true;
3101             break;
3102         }
3103     }
3104
3105     if (!platformAvailable)
3106         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3107
3108     // check if platformID corresponds to platformName
3109     String actualPlatformName;
3110     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3111     if (platformName != actualPlatformName)
3112         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3113
3114     OpenCLExecutionContext ctx;
3115     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3116     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3117     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3118     return ctx;
3119 }
3120
3121 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3122 {
3123     // internal call, less checks
3124     cl_platform_id platformID = (cl_platform_id)_platform;
3125     cl_context context = (cl_context)_context;
3126     cl_device_id deviceID = (cl_device_id)_device;
3127
3128     std::string platformName = PlatformInfo(&platformID).name();
3129
3130     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3131     CV_Assert(!clExecCtx.empty());
3132     ctx = clExecCtx.getContext();
3133 }
3134
3135 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3136
3137 struct Queue::Impl
3138 {
3139     inline void __init()
3140     {
3141         refcount = 1;
3142         handle = 0;
3143         isProfilingQueue_ = false;
3144     }
3145
3146     Impl(cl_command_queue q)
3147     {
3148         __init();
3149         handle = q;
3150
3151         cl_command_queue_properties props = 0;
3152         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3153         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3154     }
3155
3156     Impl(cl_command_queue q, bool isProfilingQueue)
3157     {
3158         __init();
3159         handle = q;
3160         isProfilingQueue_ = isProfilingQueue;
3161     }
3162
3163     Impl(const Context& c, const Device& d, bool withProfiling = false)
3164     {
3165         __init();
3166
3167         const Context* pc = &c;
3168         cl_context ch = (cl_context)pc->ptr();
3169         if( !ch )
3170         {
3171             pc = &Context::getDefault();
3172             ch = (cl_context)pc->ptr();
3173         }
3174         cl_device_id dh = (cl_device_id)d.ptr();
3175         if( !dh )
3176             dh = (cl_device_id)pc->device(0).ptr();
3177         cl_int retval = 0;
3178         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3179         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3180         isProfilingQueue_ = withProfiling;
3181     }
3182
3183     ~Impl()
3184     {
3185 #ifdef _WIN32
3186         if (!cv::__termination)
3187 #endif
3188         {
3189             if(handle)
3190             {
3191                 CV_OCL_DBG_CHECK(clFinish(handle));
3192                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3193                 handle = NULL;
3194             }
3195         }
3196     }
3197
3198     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3199     {
3200         if (isProfilingQueue_)
3201             return self;
3202
3203         if (profiling_queue_.ptr())
3204             return profiling_queue_;
3205
3206         cl_context ctx = 0;
3207         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3208
3209         cl_device_id device = 0;
3210         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3211
3212         cl_int result = CL_SUCCESS;
3213         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3214         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3215         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3216
3217         Queue queue;
3218         queue.p = new Impl(q, true);
3219         profiling_queue_ = queue;
3220
3221         return profiling_queue_;
3222     }
3223
3224     IMPLEMENT_REFCOUNTABLE();
3225
3226     cl_command_queue handle;
3227     bool isProfilingQueue_;
3228     cv::ocl::Queue profiling_queue_;
3229 };
3230
3231 Queue::Queue()
3232 {
3233     p = 0;
3234 }
3235
3236 Queue::Queue(const Context& c, const Device& d)
3237 {
3238     p = 0;
3239     create(c, d);
3240 }
3241
3242 Queue::Queue(const Queue& q)
3243 {
3244     p = q.p;
3245     if(p)
3246         p->addref();
3247 }
3248
3249 Queue& Queue::operator = (const Queue& q)
3250 {
3251     Impl* newp = (Impl*)q.p;
3252     if(newp)
3253         newp->addref();
3254     if(p)
3255         p->release();
3256     p = newp;
3257     return *this;
3258 }
3259
3260 Queue::~Queue()
3261 {
3262     if(p)
3263         p->release();
3264 }
3265
3266 bool Queue::create(const Context& c, const Device& d)
3267 {
3268     if(p)
3269         p->release();
3270     p = new Impl(c, d);
3271     return p->handle != 0;
3272 }
3273
3274 void Queue::finish()
3275 {
3276     if(p && p->handle)
3277     {
3278         CV_OCL_DBG_CHECK(clFinish(p->handle));
3279     }
3280 }
3281
3282 const Queue& Queue::getProfilingQueue() const
3283 {
3284     CV_Assert(p);
3285     return p->getProfilingQueue(*this);
3286 }
3287
3288 void* Queue::ptr() const
3289 {
3290     return p ? p->handle : 0;
3291 }
3292
3293 Queue& Queue::getDefault()
3294 {
3295     auto& c = OpenCLExecutionContext::getCurrent();
3296     if (!c.empty())
3297     {
3298         auto& q = c.getQueue();
3299         return q;
3300     }
3301     static Queue dummy;
3302     return dummy;
3303 }
3304
3305 static cl_command_queue getQueue(const Queue& q)
3306 {
3307     cl_command_queue qq = (cl_command_queue)q.ptr();
3308     if(!qq)
3309         qq = (cl_command_queue)Queue::getDefault().ptr();
3310     return qq;
3311 }
3312
3313 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3314
3315 KernelArg::KernelArg()
3316     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3317 {
3318 }
3319
3320 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3321     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3322 {
3323     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3324 }
3325
3326 KernelArg KernelArg::Constant(const Mat& m)
3327 {
3328     CV_Assert(m.isContinuous());
3329     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3330 }
3331
3332 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3333
3334 struct Kernel::Impl
3335 {
3336     Impl(const char* kname, const Program& prog) :
3337         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3338     {
3339         cl_program ph = (cl_program)prog.ptr();
3340         cl_int retval = 0;
3341         name = kname;
3342         if (ph)
3343         {
3344             handle = clCreateKernel(ph, kname, &retval);
3345             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3346         }
3347         for( int i = 0; i < MAX_ARRS; i++ )
3348             u[i] = 0;
3349         haveTempDstUMats = false;
3350         haveTempSrcUMats = false;
3351     }
3352
3353     void cleanupUMats()
3354     {
3355         for( int i = 0; i < MAX_ARRS; i++ )
3356             if( u[i] )
3357             {
3358                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3359                 {
3360                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3361                     u[i]->currAllocator->deallocate(u[i]);
3362                 }
3363                 u[i] = 0;
3364             }
3365         nu = 0;
3366         haveTempDstUMats = false;
3367         haveTempSrcUMats = false;
3368     }
3369
3370     void addUMat(const UMat& m, bool dst)
3371     {
3372         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3373         u[nu] = m.u;
3374         CV_XADD(&m.u->urefcount, 1);
3375         nu++;
3376         if(dst && m.u->tempUMat())
3377             haveTempDstUMats = true;
3378         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3379             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3380     }
3381
3382     void addImage(const Image2D& image)
3383     {
3384         images.push_back(image);
3385     }
3386
3387     void finit(cl_event e)
3388     {
3389         CV_UNUSED(e);
3390         cleanupUMats();
3391         images.clear();
3392         isInProgress = false;
3393         release();
3394     }
3395
3396     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3397             bool sync, int64* timeNS, const Queue& q);
3398
3399     ~Impl()
3400     {
3401         if(handle)
3402         {
3403             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3404         }
3405     }
3406
3407     IMPLEMENT_REFCOUNTABLE();
3408
3409     cv::String name;
3410     cl_kernel handle;
3411     enum { MAX_ARRS = 16 };
3412     UMatData* u[MAX_ARRS];
3413     bool isInProgress;
3414     bool isAsyncRun;  // true if kernel was scheduled in async mode
3415     int nu;
3416     std::list<Image2D> images;
3417     bool haveTempDstUMats;
3418     bool haveTempSrcUMats;
3419 };
3420
3421 }} // namespace cv::ocl
3422
3423 extern "C" {
3424
3425 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3426 {
3427     try
3428     {
3429         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3430     }
3431     catch (const cv::Exception& exc)
3432     {
3433         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3434     }
3435     catch (const std::exception& exc)
3436     {
3437         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3438     }
3439     catch (...)
3440     {
3441         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3442     }
3443 }
3444
3445 }
3446
3447 namespace cv { namespace ocl {
3448
3449 Kernel::Kernel()
3450 {
3451     p = 0;
3452 }
3453
3454 Kernel::Kernel(const char* kname, const Program& prog)
3455 {
3456     p = 0;
3457     create(kname, prog);
3458 }
3459
3460 Kernel::Kernel(const char* kname, const ProgramSource& src,
3461                const String& buildopts, String* errmsg)
3462 {
3463     p = 0;
3464     create(kname, src, buildopts, errmsg);
3465 }
3466
3467 Kernel::Kernel(const Kernel& k)
3468 {
3469     p = k.p;
3470     if(p)
3471         p->addref();
3472 }
3473
3474 Kernel& Kernel::operator = (const Kernel& k)
3475 {
3476     Impl* newp = (Impl*)k.p;
3477     if(newp)
3478         newp->addref();
3479     if(p)
3480         p->release();
3481     p = newp;
3482     return *this;
3483 }
3484
3485 Kernel::~Kernel()
3486 {
3487     if(p)
3488         p->release();
3489 }
3490
3491 bool Kernel::create(const char* kname, const Program& prog)
3492 {
3493     if(p)
3494         p->release();
3495     p = new Impl(kname, prog);
3496     if(p->handle == 0)
3497     {
3498         p->release();
3499         p = 0;
3500     }
3501 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3502     CV_Assert(p);
3503 #endif
3504     return p != 0;
3505 }
3506
3507 bool Kernel::create(const char* kname, const ProgramSource& src,
3508                     const String& buildopts, String* errmsg)
3509 {
3510     if(p)
3511     {
3512         p->release();
3513         p = 0;
3514     }
3515     String tempmsg;
3516     if( !errmsg ) errmsg = &tempmsg;
3517     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3518     return create(kname, prog);
3519 }
3520
3521 void* Kernel::ptr() const
3522 {
3523     return p ? p->handle : 0;
3524 }
3525
3526 bool Kernel::empty() const
3527 {
3528     return ptr() == 0;
3529 }
3530
3531 int Kernel::set(int i, const void* value, size_t sz)
3532 {
3533     if (!p || !p->handle)
3534         return -1;
3535     if (i < 0)
3536         return i;
3537     if( i == 0 )
3538         p->cleanupUMats();
3539
3540     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3541     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3542     if (retval != CL_SUCCESS)
3543         return -1;
3544     return i+1;
3545 }
3546
3547 int Kernel::set(int i, const Image2D& image2D)
3548 {
3549     p->addImage(image2D);
3550     cl_mem h = (cl_mem)image2D.ptr();
3551     return set(i, &h, sizeof(h));
3552 }
3553
3554 int Kernel::set(int i, const UMat& m)
3555 {
3556     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3557 }
3558
3559 int Kernel::set(int i, const KernelArg& arg)
3560 {
3561     if( !p || !p->handle )
3562         return -1;
3563     if (i < 0)
3564     {
3565         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3566                 p->name.c_str(), (int)i));
3567         return i;
3568     }
3569     if( i == 0 )
3570         p->cleanupUMats();
3571     cl_int status = 0;
3572     if( arg.m )
3573     {
3574         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3575                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3576         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3577         if (ptronly && arg.m->empty())
3578         {
3579             cl_mem h_null = (cl_mem)NULL;
3580             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3581             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3582             return i + 1;
3583         }
3584         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3585
3586         if (!h)
3587         {
3588             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3589                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3590             p->release();
3591             p = 0;
3592             return -1;
3593         }
3594
3595 #ifdef HAVE_OPENCL_SVM
3596         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3597         {
3598             const Context& ctx = Context::getDefault();
3599             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3600             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3601             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3602 #if 1 // TODO
3603             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3604 #else
3605             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3606 #endif
3607             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3608         }
3609         else
3610 #endif
3611         {
3612             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3613             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3614         }
3615
3616         if (ptronly)
3617         {
3618             i++;
3619         }
3620         else if( arg.m->dims <= 2 )
3621         {
3622             UMat2D u2d(*arg.m);
3623             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3624             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3625             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3626             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3627             i += 3;
3628
3629             if( !(arg.flags & KernelArg::NO_SIZE) )
3630             {
3631                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3632                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3633                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3634                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3635                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3636                 i += 2;
3637             }
3638         }
3639         else
3640         {
3641             UMat3D u3d(*arg.m);
3642             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3643             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3644             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3645             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3646             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3647             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3648             i += 4;
3649             if( !(arg.flags & KernelArg::NO_SIZE) )
3650             {
3651                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3652                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3653                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3654                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3655                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3656                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3657                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3658                 i += 3;
3659             }
3660         }
3661         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3662         return i;
3663     }
3664     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3665     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3666     return i+1;
3667 }
3668
3669 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3670                  bool sync, const Queue& q)
3671 {
3672     if (!p)
3673         return false;
3674
3675     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3676     size_t total = 1;
3677     CV_Assert(_globalsize != NULL);
3678     for (int i = 0; i < dims; i++)
3679     {
3680         size_t val = _localsize ? _localsize[i] :
3681             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3682         CV_Assert( val > 0 );
3683         total *= _globalsize[i];
3684         if (_globalsize[i] == 1 && !_localsize)
3685             val = 1;
3686         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3687     }
3688     CV_Assert(total > 0);
3689
3690     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3691 }
3692
3693
3694 static bool isRaiseErrorOnReuseAsyncKernel()
3695 {
3696     static bool initialized = false;
3697     static bool value = false;
3698     if (!initialized)
3699     {
3700         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3701         initialized = true;
3702     }
3703     return value;
3704 }
3705
3706 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3707         bool sync, int64* timeNS, const Queue& q)
3708 {
3709     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3710
3711     if (!handle)
3712     {
3713         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3714         return false;
3715     }
3716
3717     if (isAsyncRun)
3718     {
3719         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3720         if (isRaiseErrorOnReuseAsyncKernel())
3721             CV_Assert(0);
3722         return false;  // OpenCV 5.0: raise error
3723     }
3724     isAsyncRun = !sync;
3725
3726     if (isInProgress)
3727     {
3728         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3729         if (isRaiseErrorOnReuseAsyncKernel())
3730             CV_Assert(0);
3731         return false;  // OpenCV 5.0: raise error
3732     }
3733
3734     cl_command_queue qq = getQueue(q);
3735     if (haveTempDstUMats)
3736         sync = true;
3737     if (haveTempSrcUMats)
3738         sync = true;
3739     if (timeNS)
3740         sync = true;
3741     cl_event asyncEvent = 0;
3742     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3743                                            NULL, globalsize, localsize, 0, 0,
3744                                            (sync && !timeNS) ? 0 : &asyncEvent);
3745 #if !CV_OPENCL_SHOW_RUN_KERNELS
3746     if (retval != CL_SUCCESS)
3747 #endif
3748     {
3749         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3750                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3751                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3752                         sync ? "true" : "false"
3753                         );
3754         if (retval != CL_SUCCESS)
3755         {
3756             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3757         }
3758 #if CV_OPENCL_TRACE_CHECK
3759         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3760 #else
3761         printf("%s\n", msg.c_str());
3762         fflush(stdout);
3763 #endif
3764     }
3765     if (sync || retval != CL_SUCCESS)
3766     {
3767         CV_OCL_DBG_CHECK(clFinish(qq));
3768         if (timeNS)
3769         {
3770             if (retval == CL_SUCCESS)
3771             {
3772                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3773                 cl_ulong startTime, stopTime;
3774                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3775                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3776                 *timeNS = (int64)(stopTime - startTime);
3777             }
3778             else
3779             {
3780                 *timeNS = -1;
3781             }
3782         }
3783         cleanupUMats();
3784     }
3785     else
3786     {
3787         addref();
3788         isInProgress = true;
3789         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3790     }
3791     if (asyncEvent)
3792         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3793     return retval == CL_SUCCESS;
3794 }
3795
3796 bool Kernel::runTask(bool sync, const Queue& q)
3797 {
3798     if(!p || !p->handle || p->isInProgress)
3799         return false;
3800
3801     cl_command_queue qq = getQueue(q);
3802     cl_event asyncEvent = 0;
3803     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3804     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3805     if (sync || retval != CL_SUCCESS)
3806     {
3807         CV_OCL_DBG_CHECK(clFinish(qq));
3808         p->cleanupUMats();
3809     }
3810     else
3811     {
3812         p->addref();
3813         p->isInProgress = true;
3814         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3815     }
3816     if (asyncEvent)
3817         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3818     return retval == CL_SUCCESS;
3819 }
3820
3821 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3822 {
3823     CV_Assert(p && p->handle && !p->isInProgress);
3824     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3825     CV_Assert(q.ptr());
3826     q.finish(); // call clFinish() on base queue
3827     Queue profilingQueue = q.getProfilingQueue();
3828     int64 timeNs = -1;
3829     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3830     return res ? timeNs : -1;
3831 }
3832
3833 size_t Kernel::workGroupSize() const
3834 {
3835     if(!p || !p->handle)
3836         return 0;
3837     size_t val = 0, retsz = 0;
3838     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3839     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3840     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3841     return status == CL_SUCCESS ? val : 0;
3842 }
3843
3844 size_t Kernel::preferedWorkGroupSizeMultiple() const
3845 {
3846     if(!p || !p->handle)
3847         return 0;
3848     size_t val = 0, retsz = 0;
3849     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3850     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3851     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3852     return status == CL_SUCCESS ? val : 0;
3853 }
3854
3855 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3856 {
3857     if(!p || !p->handle || !wsz)
3858         return 0;
3859     size_t retsz = 0;
3860     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3861     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3862     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3863     return status == CL_SUCCESS;
3864 }
3865
3866 size_t Kernel::localMemSize() const
3867 {
3868     if(!p || !p->handle)
3869         return 0;
3870     size_t retsz = 0;
3871     cl_ulong val = 0;
3872     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3873     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3874     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3875     return status == CL_SUCCESS ? (size_t)val : 0;
3876 }
3877
3878
3879
3880 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3881
3882 struct ProgramSource::Impl
3883 {
3884     IMPLEMENT_REFCOUNTABLE();
3885
3886     enum KIND {
3887         PROGRAM_SOURCE_CODE = 0,
3888         PROGRAM_BINARIES,
3889         PROGRAM_SPIR,
3890         PROGRAM_SPIRV
3891     } kind_;
3892
3893     Impl(const String& src)
3894     {
3895         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3896         initFromSource(src, cv::String());
3897     }
3898     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3899     {
3900         init(PROGRAM_SOURCE_CODE, module, name);
3901         initFromSource(codeStr, codeHash);
3902     }
3903
3904     /// reset fields
3905     void init(enum KIND kind, const String& module, const String& name)
3906     {
3907         refcount = 1;
3908         kind_ = kind;
3909         module_ = module;
3910         name_ = name;
3911
3912         sourceAddr_ = NULL;
3913         sourceSize_ = 0;
3914         isHashUpdated = false;
3915     }
3916
3917     void initFromSource(const String& codeStr, const String& codeHash)
3918     {
3919         codeStr_ = codeStr;
3920         sourceHash_ = codeHash;
3921         if (sourceHash_.empty())
3922         {
3923             updateHash();
3924         }
3925         else
3926         {
3927             isHashUpdated = true;
3928         }
3929     }
3930
3931     void updateHash(const char* hashStr = NULL)
3932     {
3933         if (hashStr)
3934         {
3935             sourceHash_ = cv::String(hashStr);
3936             isHashUpdated = true;
3937             return;
3938         }
3939         uint64 hash = 0;
3940         switch (kind_)
3941         {
3942         case PROGRAM_SOURCE_CODE:
3943             if (sourceAddr_)
3944             {
3945                 CV_Assert(codeStr_.empty());
3946                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3947             }
3948             else
3949             {
3950                 CV_Assert(!codeStr_.empty());
3951                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3952             }
3953             break;
3954         case PROGRAM_BINARIES:
3955         case PROGRAM_SPIR:
3956         case PROGRAM_SPIRV:
3957             hash = crc64(sourceAddr_, sourceSize_);
3958             break;
3959         default:
3960             CV_Error(Error::StsInternal, "Internal error");
3961         }
3962         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3963         isHashUpdated = true;
3964     }
3965
3966     Impl(enum KIND kind,
3967             const String& module, const String& name,
3968             const unsigned char* binary, const size_t size,
3969             const cv::String& buildOptions = cv::String())
3970     {
3971         init(kind, module, name);
3972
3973         sourceAddr_ = binary;
3974         sourceSize_ = size;
3975
3976         buildOptions_ = buildOptions;
3977     }
3978
3979     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3980             const char* sourceCodeStaticStr, const char* hashStaticStr,
3981             const cv::String& buildOptions)
3982     {
3983         ProgramSource result;
3984         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3985                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3986         result.p->updateHash(hashStaticStr);
3987         return result;
3988     }
3989
3990     static ProgramSource fromBinary(const String& module, const String& name,
3991             const unsigned char* binary, const size_t size,
3992             const cv::String& buildOptions)
3993     {
3994         ProgramSource result;
3995         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3996         return result;
3997     }
3998
3999     static ProgramSource fromSPIR(const String& module, const String& name,
4000             const unsigned char* binary, const size_t size,
4001             const cv::String& buildOptions)
4002     {
4003         ProgramSource result;
4004         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4005         return result;
4006     }
4007
4008     String module_;
4009     String name_;
4010
4011     // TODO std::vector<ProgramSource> includes_;
4012     String codeStr_; // PROGRAM_SOURCE_CODE only
4013
4014     const unsigned char* sourceAddr_;
4015     size_t sourceSize_;
4016
4017     cv::String buildOptions_;
4018
4019     String sourceHash_;
4020     bool isHashUpdated;
4021
4022     friend struct Program::Impl;
4023     friend struct internal::ProgramEntry;
4024     friend struct Context::Impl;
4025 };
4026
4027
4028 ProgramSource::ProgramSource()
4029 {
4030     p = 0;
4031 }
4032
4033 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4034 {
4035     p = new Impl(module, name, codeStr, codeHash);
4036 }
4037
4038 ProgramSource::ProgramSource(const char* prog)
4039 {
4040     p = new Impl(prog);
4041 }
4042
4043 ProgramSource::ProgramSource(const String& prog)
4044 {
4045     p = new Impl(prog);
4046 }
4047
4048 ProgramSource::~ProgramSource()
4049 {
4050     if(p)
4051         p->release();
4052 }
4053
4054 ProgramSource::ProgramSource(const ProgramSource& prog)
4055 {
4056     p = prog.p;
4057     if(p)
4058         p->addref();
4059 }
4060
4061 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4062 {
4063     Impl* newp = (Impl*)prog.p;
4064     if(newp)
4065         newp->addref();
4066     if(p)
4067         p->release();
4068     p = newp;
4069     return *this;
4070 }
4071
4072 const String& ProgramSource::source() const
4073 {
4074     CV_Assert(p);
4075     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4076     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4077     return p->codeStr_;
4078 }
4079
4080 ProgramSource::hash_t ProgramSource::hash() const
4081 {
4082     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4083 }
4084
4085 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4086         const unsigned char* binary, const size_t size,
4087         const cv::String& buildOptions)
4088 {
4089     CV_Assert(binary);
4090     CV_Assert(size > 0);
4091     return Impl::fromBinary(module, name, binary, size, buildOptions);
4092 }
4093
4094 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4095         const unsigned char* binary, const size_t size,
4096         const cv::String& buildOptions)
4097 {
4098     CV_Assert(binary);
4099     CV_Assert(size > 0);
4100     return Impl::fromBinary(module, name, binary, size, buildOptions);
4101 }
4102
4103
4104 internal::ProgramEntry::operator ProgramSource&() const
4105 {
4106     if (this->pProgramSource == NULL)
4107     {
4108         cv::AutoLock lock(cv::getInitializationMutex());
4109         if (this->pProgramSource == NULL)
4110         {
4111             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4112             ProgramSource* ptr = new ProgramSource(ps);
4113             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4114         }
4115     }
4116     return *this->pProgramSource;
4117 }
4118
4119
4120
4121 /////////////////////////////////////////// Program /////////////////////////////////////////////
4122
4123 static
4124 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4125 {
4126     if (b.empty())
4127         return a;
4128     if (a.empty())
4129         return b;
4130     if (b[0] == ' ')
4131         return a + b;
4132     return a + (cv::String(" ") + b);
4133 }
4134
4135 struct Program::Impl
4136 {
4137     IMPLEMENT_REFCOUNTABLE();
4138
4139     Impl(const ProgramSource& src,
4140          const String& _buildflags, String& errmsg) :
4141          refcount(1),
4142          handle(NULL),
4143          buildflags(_buildflags)
4144     {
4145         const ProgramSource::Impl* src_ = src.getImpl();
4146         CV_Assert(src_);
4147         sourceModule_ = src_->module_;
4148         sourceName_ = src_->name_;
4149         const Context ctx = Context::getDefault();
4150         Device device = ctx.device(0);
4151         if (ctx.ptr() == NULL || device.ptr() == NULL)
4152             return;
4153         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4154         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4155         {
4156             if (device.isAMD())
4157                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4158             else if (device.isIntel())
4159                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4160             const String param_buildExtraOptions = getBuildExtraOptions();
4161             if (!param_buildExtraOptions.empty())
4162                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4163         }
4164         compile(ctx, src_, errmsg);
4165     }
4166
4167     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4168     {
4169         CV_Assert(ctx.getImpl());
4170         CV_Assert(src_);
4171
4172         // We don't cache OpenCL binaries
4173         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4174         {
4175             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4176             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4177             return isLoaded;
4178         }
4179         return compileWithCache(ctx, src_, errmsg);
4180     }
4181
4182     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4183     {
4184         CV_Assert(ctx.getImpl());
4185         CV_Assert(src_);
4186         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4187
4188 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4189         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4190         const std::string base_dir = config.prepareCacheDirectoryForContext(
4191                 ctx.getImpl()->getPrefixString(),
4192                 ctx.getImpl()->getPrefixBase()
4193         );
4194         const String& hash_str = src_->sourceHash_;
4195         cv::String fname;
4196         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4197         {
4198             CV_Assert(!hash_str.empty());
4199             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4200             fname = utils::fs::join(base_dir, fname);
4201         }
4202         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4203         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4204         {
4205             try
4206             {
4207                 std::vector<char> binaryBuf;
4208                 bool res = false;
4209                 {
4210                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4211                     BinaryProgramFile file(fname, hash_str.c_str());
4212                     res = file.read(buildflags, binaryBuf);
4213                 }
4214                 if (res)
4215                 {
4216                     CV_Assert(!binaryBuf.empty());
4217                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4218                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4219                     if (isLoaded)
4220                         return true;
4221                 }
4222             }
4223             catch (const cv::Exception& e)
4224             {
4225                 CV_UNUSED(e);
4226                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4227             }
4228             catch (...)
4229             {
4230                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4231             }
4232         }
4233 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4234         CV_Assert(handle == NULL);
4235         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4236         {
4237             if (!buildFromSources(ctx, src_, errmsg))
4238             {
4239                 return false;
4240             }
4241         }
4242         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4243         {
4244             buildflags = joinBuildOptions(buildflags, " -x spir");
4245             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4246             {
4247                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4248             }
4249             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4250             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4251             if (!isLoaded)
4252                 return false;
4253         }
4254         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4255         {
4256             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4257         }
4258         else
4259         {
4260             CV_Error(Error::StsInternal, "Internal error");
4261         }
4262         CV_Assert(handle != NULL);
4263 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4264         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4265         {
4266             try
4267             {
4268                 std::vector<char> binaryBuf;
4269                 getProgramBinary(binaryBuf);
4270                 {
4271                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4272                     BinaryProgramFile file(fname, hash_str.c_str());
4273                     file.write(buildflags, binaryBuf);
4274                 }
4275             }
4276             catch (const cv::Exception& e)
4277             {
4278                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4279             }
4280             catch (...)
4281             {
4282                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4283             }
4284         }
4285 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4286 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4287         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4288         {
4289             std::vector<char> binaryBuf;
4290             getProgramBinary(binaryBuf);
4291             if (!binaryBuf.empty())
4292             {
4293                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4294                 handle = NULL;
4295                 createFromBinary(ctx, binaryBuf, errmsg);
4296             }
4297         }
4298 #endif
4299         return handle != NULL;
4300     }
4301
4302     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4303     {
4304         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4305
4306         size_t retsz = 0;
4307         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4308                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4309         if (log_retval == CL_SUCCESS && retsz > 1)
4310         {
4311             buffer.resize(retsz + 16);
4312             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4313                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4314             if (log_retval == CL_SUCCESS)
4315             {
4316                 if (retsz < buffer.size())
4317                     buffer[retsz] = 0;
4318                 else
4319                     buffer[buffer.size() - 1] = 0;
4320             }
4321             else
4322             {
4323                 buffer[0] = 0;
4324             }
4325         }
4326
4327         errmsg = String(buffer.data());
4328         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4329                 sourceModule_.c_str(), sourceName_.c_str(),
4330                 result, getOpenCLErrorString(result),
4331                 buildflags.c_str(), errmsg.c_str());
4332         fflush(stdout);
4333     }
4334
4335     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4336     {
4337         CV_Assert(src_);
4338         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4339         CV_Assert(handle == NULL);
4340         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4341                 sourceModule_.c_str(), sourceName_.c_str(),
4342                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4343
4344         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4345
4346         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4347         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4348         CV_Assert(srcptr != NULL);
4349         CV_Assert(srclen > 0);
4350
4351         cl_int retval = 0;
4352
4353         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4354         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4355         CV_Assert(handle || retval != CL_SUCCESS);
4356         if (handle && retval == CL_SUCCESS)
4357         {
4358             size_t n = ctx.ndevices();
4359             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4360             cl_device_id* deviceList = deviceListBuf.data();
4361             for (size_t i = 0; i < n; i++)
4362             {
4363                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4364             }
4365
4366             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4367             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4368 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4369             if (retval != CL_SUCCESS)
4370 #endif
4371             {
4372                 dumpBuildLog_(retval, deviceList, errmsg);
4373
4374                 // don't remove "retval != CL_SUCCESS" condition here:
4375                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4376                 if (retval != CL_SUCCESS && handle)
4377                 {
4378                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4379                     handle = NULL;
4380                 }
4381             }
4382 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4383             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4384             {
4385                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4386                 size_t retsz = 0;
4387                 char kernels_buffer[4096] = {0};
4388                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4389                 if (retsz < sizeof(kernels_buffer))
4390                     kernels_buffer[retsz] = 0;
4391                 else
4392                     kernels_buffer[0] = 0;
4393                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4394             }
4395 #endif
4396
4397         }
4398         return handle != NULL;
4399     }
4400
4401     void getProgramBinary(std::vector<char>& buf)
4402     {
4403         CV_Assert(handle);
4404         size_t sz = 0;
4405         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4406         buf.resize(sz);
4407         uchar* ptr = (uchar*)&buf[0];
4408         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4409     }
4410
4411     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4412     {
4413         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4414     }
4415
4416     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4417     {
4418         CV_Assert(handle == NULL);
4419         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4420         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4421
4422         CV_Assert(binarySize > 0);
4423
4424         size_t ndevices = (int)ctx.ndevices();
4425         AutoBuffer<cl_device_id> devices_(ndevices);
4426         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4427         AutoBuffer<size_t> binarySizes_(ndevices);
4428
4429         cl_device_id* devices = devices_.data();
4430         const uchar** binaryPtrs = binaryPtrs_.data();
4431         size_t* binarySizes = binarySizes_.data();
4432         for (size_t i = 0; i < ndevices; i++)
4433         {
4434             devices[i] = (cl_device_id)ctx.device(i).ptr();
4435             binaryPtrs[i] = binaryAddr;
4436             binarySizes[i] = binarySize;
4437         }
4438
4439         cl_int result = 0;
4440         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4441                                            binarySizes, binaryPtrs, NULL, &result);
4442         if (result != CL_SUCCESS)
4443         {
4444             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4445             if (handle)
4446             {
4447                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4448                 handle = NULL;
4449             }
4450         }
4451         if (!handle)
4452         {
4453             return false;
4454         }
4455         // call clBuildProgram()
4456         {
4457             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4458             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4459             if (result != CL_SUCCESS)
4460             {
4461                 dumpBuildLog_(result, devices, errmsg);
4462                 if (handle)
4463                 {
4464                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4465                     handle = NULL;
4466                 }
4467                 return false;
4468             }
4469         }
4470         // check build status
4471         {
4472             cl_build_status build_status = CL_BUILD_NONE;
4473             size_t retsz = 0;
4474             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4475                     sizeof(build_status), &build_status, &retsz));
4476             if (result == CL_SUCCESS)
4477             {
4478                 if (build_status == CL_BUILD_SUCCESS)
4479                 {
4480                     return true;
4481                 }
4482                 else
4483                 {
4484                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4485                     return false;
4486                 }
4487             }
4488             else
4489             {
4490                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4491                 if (handle)
4492                 {
4493                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4494                     handle = NULL;
4495                 }
4496             }
4497         }
4498 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4499         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4500         {
4501             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4502             size_t retsz = 0;
4503             char kernels_buffer[4096] = {0};
4504             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4505             if (retsz < sizeof(kernels_buffer))
4506                 kernels_buffer[retsz] = 0;
4507             else
4508                 kernels_buffer[0] = 0;
4509             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4510         }
4511 #endif
4512         return handle != NULL;
4513     }
4514
4515     ~Impl()
4516     {
4517         if( handle )
4518         {
4519 #ifdef _WIN32
4520             if (!cv::__termination)
4521 #endif
4522             {
4523                 clReleaseProgram(handle);
4524             }
4525             handle = NULL;
4526         }
4527     }
4528
4529     cl_program handle;
4530
4531     String buildflags;
4532     String sourceModule_;
4533     String sourceName_;
4534 };
4535
4536
4537 Program::Program() { p = 0; }
4538
4539 Program::Program(const ProgramSource& src,
4540         const String& buildflags, String& errmsg)
4541 {
4542     p = 0;
4543     create(src, buildflags, errmsg);
4544 }
4545
4546 Program::Program(const Program& prog)
4547 {
4548     p = prog.p;
4549     if(p)
4550         p->addref();
4551 }
4552
4553 Program& Program::operator = (const Program& prog)
4554 {
4555     Impl* newp = (Impl*)prog.p;
4556     if(newp)
4557         newp->addref();
4558     if(p)
4559         p->release();
4560     p = newp;
4561     return *this;
4562 }
4563
4564 Program::~Program()
4565 {
4566     if(p)
4567         p->release();
4568 }
4569
4570 bool Program::create(const ProgramSource& src,
4571             const String& buildflags, String& errmsg)
4572 {
4573     if(p)
4574     {
4575         p->release();
4576         p = NULL;
4577     }
4578     p = new Impl(src, buildflags, errmsg);
4579     if(!p->handle)
4580     {
4581         p->release();
4582         p = 0;
4583     }
4584     return p != 0;
4585 }
4586
4587 void* Program::ptr() const
4588 {
4589     return p ? p->handle : 0;
4590 }
4591
4592 #ifndef OPENCV_REMOVE_DEPRECATED_API
4593 const ProgramSource& Program::source() const
4594 {
4595     CV_Error(Error::StsNotImplemented, "Removed API");
4596 }
4597
4598 bool Program::read(const String& bin, const String& buildflags)
4599 {
4600     CV_UNUSED(bin); CV_UNUSED(buildflags);
4601     CV_Error(Error::StsNotImplemented, "Removed API");
4602 }
4603
4604 bool Program::write(String& bin) const
4605 {
4606     CV_UNUSED(bin);
4607     CV_Error(Error::StsNotImplemented, "Removed API");
4608 }
4609
4610 String Program::getPrefix() const
4611 {
4612     if(!p)
4613         return String();
4614     Context::Impl* ctx_ = Context::getDefault().getImpl();
4615     CV_Assert(ctx_);
4616     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4617 }
4618
4619 String Program::getPrefix(const String& buildflags)
4620 {
4621         Context::Impl* ctx_ = Context::getDefault().getImpl();
4622         CV_Assert(ctx_);
4623         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4624 }
4625 #endif // OPENCV_REMOVE_DEPRECATED_API
4626
4627 void Program::getBinary(std::vector<char>& binary) const
4628 {
4629     CV_Assert(p && "Empty program");
4630     p->getProgramBinary(binary);
4631 }
4632
4633 Program Context::Impl::getProg(const ProgramSource& src,
4634                                const String& buildflags, String& errmsg)
4635 {
4636     size_t limit = getProgramCountLimit();
4637     const ProgramSource::Impl* src_ = src.getImpl();
4638     CV_Assert(src_);
4639     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4640             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4641             getPrefixString().c_str(),
4642             buildflags.c_str());
4643     {
4644         cv::AutoLock lock(program_cache_mutex);
4645         phash_t::iterator it = phash.find(key);
4646         if (it != phash.end())
4647         {
4648             // TODO LRU cache
4649             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4650             if (i != cacheList.end() && i != cacheList.begin())
4651             {
4652                 cacheList.erase(i);
4653                 cacheList.push_front(key);
4654             }
4655             return it->second;
4656         }
4657         { // cleanup program cache
4658             size_t sz = phash.size();
4659             if (limit > 0 && sz >= limit)
4660             {
4661                 static bool warningFlag = false;
4662                 if (!warningFlag)
4663                 {
4664                     printf("\nWARNING: OpenCV-OpenCL:\n"
4665                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4666                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4667                     warningFlag = true;
4668                 }
4669                 while (!cacheList.empty())
4670                 {
4671                     size_t c = phash.erase(cacheList.back());
4672                     cacheList.pop_back();
4673                     if (c != 0)
4674                         break;
4675                 }
4676             }
4677         }
4678     }
4679     Program prog(src, buildflags, errmsg);
4680     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4681     {
4682         cv::AutoLock lock(program_cache_mutex);
4683         phash.insert(std::pair<std::string, Program>(key, prog));
4684         cacheList.push_front(key);
4685     }
4686     return prog;
4687 }
4688
4689
4690 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4691
4692 template<typename T>
4693 class OpenCLBufferPool
4694 {
4695 protected:
4696     ~OpenCLBufferPool() { }
4697 public:
4698     virtual T allocate(size_t size) = 0;
4699     virtual void release(T buffer) = 0;
4700 };
4701
4702 template <typename Derived, typename BufferEntry, typename T>
4703 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4704 {
4705 private:
4706     inline Derived& derived() { return *static_cast<Derived*>(this); }
4707 protected:
4708     Mutex mutex_;
4709
4710     size_t currentReservedSize;
4711     size_t maxReservedSize;
4712
4713     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4714     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4715
4716     // synchronized
4717     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4718     {
4719         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4720         for (; i != allocatedEntries_.end(); ++i)
4721         {
4722             BufferEntry& e = *i;
4723             if (e.clBuffer_ == buffer)
4724             {
4725                 entry = e;
4726                 allocatedEntries_.erase(i);
4727                 return true;
4728             }
4729         }
4730         return false;
4731     }
4732
4733     // synchronized
4734     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4735     {
4736         if (reservedEntries_.empty())
4737             return false;
4738         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4739         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4740         BufferEntry result;
4741         size_t minDiff = (size_t)(-1);
4742         for (; i != reservedEntries_.end(); ++i)
4743         {
4744             BufferEntry& e = *i;
4745             if (e.capacity_ >= size)
4746             {
4747                 size_t diff = e.capacity_ - size;
4748                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4749                 {
4750                     minDiff = diff;
4751                     result_pos = i;
4752                     result = e;
4753                     if (diff == 0)
4754                         break;
4755                 }
4756             }
4757         }
4758         if (result_pos != reservedEntries_.end())
4759         {
4760             //CV_DbgAssert(result == *result_pos);
4761             reservedEntries_.erase(result_pos);
4762             entry = result;
4763             currentReservedSize -= entry.capacity_;
4764             allocatedEntries_.push_back(entry);
4765             return true;
4766         }
4767         return false;
4768     }
4769
4770     // synchronized
4771     void _checkSizeOfReservedEntries()
4772     {
4773         while (currentReservedSize > maxReservedSize)
4774         {
4775             CV_DbgAssert(!reservedEntries_.empty());
4776             const BufferEntry& entry = reservedEntries_.back();
4777             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4778             currentReservedSize -= entry.capacity_;
4779             derived()._releaseBufferEntry(entry);
4780             reservedEntries_.pop_back();
4781         }
4782     }
4783
4784     inline size_t _allocationGranularity(size_t size)
4785     {
4786         // heuristic values
4787         if (size < 1024*1024)
4788             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4789         else if (size < 16*1024*1024)
4790             return 64*1024;
4791         else
4792             return 1024*1024;
4793     }
4794
4795 public:
4796     OpenCLBufferPoolBaseImpl()
4797         : currentReservedSize(0),
4798           maxReservedSize(0)
4799     {
4800         // nothing
4801     }
4802     virtual ~OpenCLBufferPoolBaseImpl()
4803     {
4804         freeAllReservedBuffers();
4805         CV_Assert(reservedEntries_.empty());
4806     }
4807 public:
4808     virtual T allocate(size_t size) CV_OVERRIDE
4809     {
4810         AutoLock locker(mutex_);
4811         BufferEntry entry;
4812         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4813         {
4814             CV_DbgAssert(size <= entry.capacity_);
4815             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4816         }
4817         else
4818         {
4819             derived()._allocateBufferEntry(entry, size);
4820         }
4821         return entry.clBuffer_;
4822     }
4823     virtual void release(T buffer) CV_OVERRIDE
4824     {
4825         AutoLock locker(mutex_);
4826         BufferEntry entry;
4827         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4828         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4829         {
4830             derived()._releaseBufferEntry(entry);
4831         }
4832         else
4833         {
4834             reservedEntries_.push_front(entry);
4835             currentReservedSize += entry.capacity_;
4836             _checkSizeOfReservedEntries();
4837         }
4838     }
4839
4840     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4841     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4842     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4843     {
4844         AutoLock locker(mutex_);
4845         size_t oldMaxReservedSize = maxReservedSize;
4846         maxReservedSize = size;
4847         if (maxReservedSize < oldMaxReservedSize)
4848         {
4849             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4850             for (; i != reservedEntries_.end();)
4851             {
4852                 const BufferEntry& entry = *i;
4853                 if (entry.capacity_ > maxReservedSize / 8)
4854                 {
4855                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4856                     currentReservedSize -= entry.capacity_;
4857                     derived()._releaseBufferEntry(entry);
4858                     i = reservedEntries_.erase(i);
4859                     continue;
4860                 }
4861                 ++i;
4862             }
4863             _checkSizeOfReservedEntries();
4864         }
4865     }
4866     virtual void freeAllReservedBuffers() CV_OVERRIDE
4867     {
4868         AutoLock locker(mutex_);
4869         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4870         for (; i != reservedEntries_.end(); ++i)
4871         {
4872             const BufferEntry& entry = *i;
4873             derived()._releaseBufferEntry(entry);
4874         }
4875         reservedEntries_.clear();
4876         currentReservedSize = 0;
4877     }
4878 };
4879
4880 struct CLBufferEntry
4881 {
4882     cl_mem clBuffer_;
4883     size_t capacity_;
4884     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4885 };
4886
4887 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4888 {
4889 public:
4890     typedef struct CLBufferEntry BufferEntry;
4891 protected:
4892     int createFlags_;
4893 public:
4894     OpenCLBufferPoolImpl(int createFlags = 0)
4895         : createFlags_(createFlags)
4896     {
4897     }
4898
4899     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4900     {
4901         CV_DbgAssert(entry.clBuffer_ == NULL);
4902         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4903         Context& ctx = Context::getDefault();
4904         cl_int retval = CL_SUCCESS;
4905         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4906         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4907         CV_Assert(entry.clBuffer_ != NULL);
4908         if(retval == CL_SUCCESS)
4909         {
4910             CV_IMPL_ADD(CV_IMPL_OCL);
4911         }
4912         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4913                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4914         allocatedEntries_.push_back(entry);
4915     }
4916
4917     void _releaseBufferEntry(const BufferEntry& entry)
4918     {
4919         CV_Assert(entry.capacity_ != 0);
4920         CV_Assert(entry.clBuffer_ != NULL);
4921         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4922                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4923         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4924     }
4925 };
4926
4927 #ifdef HAVE_OPENCL_SVM
4928 struct CLSVMBufferEntry
4929 {
4930     void* clBuffer_;
4931     size_t capacity_;
4932     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4933 };
4934 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4935 {
4936 public:
4937     typedef struct CLSVMBufferEntry BufferEntry;
4938 public:
4939     OpenCLSVMBufferPoolImpl()
4940     {
4941     }
4942
4943     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4944     {
4945         CV_DbgAssert(entry.clBuffer_ == NULL);
4946         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4947
4948         Context& ctx = Context::getDefault();
4949         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4950         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4951         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4952                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4953
4954         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4955         CV_DbgAssert(svmFns->isValid());
4956
4957         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4958         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4959         CV_Assert(buf);
4960
4961         entry.clBuffer_ = buf;
4962         {
4963             CV_IMPL_ADD(CV_IMPL_OCL);
4964         }
4965         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4966                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4967         allocatedEntries_.push_back(entry);
4968     }
4969
4970     void _releaseBufferEntry(const BufferEntry& entry)
4971     {
4972         CV_Assert(entry.capacity_ != 0);
4973         CV_Assert(entry.clBuffer_ != NULL);
4974         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4975                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4976         Context& ctx = Context::getDefault();
4977         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4978         CV_DbgAssert(svmFns->isValid());
4979         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4980         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4981     }
4982 };
4983 #endif
4984
4985
4986
4987 template <bool readAccess, bool writeAccess>
4988 class AlignedDataPtr
4989 {
4990 protected:
4991     const size_t size_;
4992     uchar* const originPtr_;
4993     const size_t alignment_;
4994     uchar* ptr_;
4995     uchar* allocatedPtr_;
4996
4997 public:
4998     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4999         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5000     {
5001         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5002         CV_DbgAssert(!readAccess || ptr);
5003         if (((size_t)ptr_ & (alignment - 1)) != 0)
5004         {
5005             allocatedPtr_ = new uchar[size_ + alignment - 1];
5006             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5007             if (readAccess)
5008             {
5009                 memcpy(ptr_, originPtr_, size_);
5010             }
5011         }
5012     }
5013
5014     uchar* getAlignedPtr() const
5015     {
5016         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5017         return ptr_;
5018     }
5019
5020     ~AlignedDataPtr()
5021     {
5022         if (allocatedPtr_)
5023         {
5024             if (writeAccess)
5025             {
5026                 memcpy(originPtr_, ptr_, size_);
5027             }
5028             delete[] allocatedPtr_;
5029             allocatedPtr_ = NULL;
5030         }
5031         ptr_ = NULL;
5032     }
5033 private:
5034     AlignedDataPtr(const AlignedDataPtr&); // disabled
5035     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5036 };
5037
5038 template <bool readAccess, bool writeAccess>
5039 class AlignedDataPtr2D
5040 {
5041 protected:
5042     const size_t size_;
5043     uchar* const originPtr_;
5044     const size_t alignment_;
5045     uchar* ptr_;
5046     uchar* allocatedPtr_;
5047     size_t rows_;
5048     size_t cols_;
5049     size_t step_;
5050
5051 public:
5052     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5053         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5054     {
5055         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5056         CV_DbgAssert(!readAccess || ptr != NULL);
5057         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5058         {
5059             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5060             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5061             if (readAccess)
5062             {
5063                 for (size_t i = 0; i < rows_; i++)
5064                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5065             }
5066         }
5067     }
5068
5069     uchar* getAlignedPtr() const
5070     {
5071         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5072         return ptr_;
5073     }
5074
5075     ~AlignedDataPtr2D()
5076     {
5077         if (allocatedPtr_)
5078         {
5079             if (writeAccess)
5080             {
5081                 for (size_t i = 0; i < rows_; i++)
5082                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5083             }
5084             delete[] allocatedPtr_;
5085             allocatedPtr_ = NULL;
5086         }
5087         ptr_ = NULL;
5088     }
5089 private:
5090     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5091     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5092 };
5093
5094 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5095 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5096 #endif
5097
5098
5099 void Context::Impl::__init_buffer_pools()
5100 {
5101     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5102     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5103     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5104     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5105
5106     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5107     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5108     bufferPool.setMaxReservedSize(poolSize);
5109     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5110     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5111
5112 #ifdef HAVE_OPENCL_SVM
5113     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5114     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5115     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5116     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5117 #endif
5118
5119     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5120 }
5121
5122 class OpenCLAllocator CV_FINAL : public MatAllocator
5123 {
5124 public:
5125     enum AllocatorFlags
5126     {
5127         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5128         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5129 #ifdef HAVE_OPENCL_SVM
5130         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5131 #endif
5132         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5133     };
5134
5135     OpenCLAllocator()
5136     {
5137         matStdAllocator = Mat::getDefaultAllocator();
5138     }
5139     ~OpenCLAllocator()
5140     {
5141         flushCleanupQueue();
5142     }
5143
5144     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5145             AccessFlag flags, UMatUsageFlags usageFlags) const
5146     {
5147         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5148         return u;
5149     }
5150
5151     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5152     {
5153         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5154         return value;
5155     }
5156     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5157     {
5158         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5159         return value;
5160     }
5161
5162     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5163     {
5164         const Device& dev = ctx.device(0);
5165         createFlags = 0;
5166         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5167             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5168
5169         if (!isOpenCLCopyingForced() &&
5170             (isOpenCLMapForced() ||
5171                 (dev.hostUnifiedMemory()
5172 #ifndef __APPLE__
5173                 || dev.isIntel()
5174 #endif
5175                 )
5176             )
5177         )
5178             flags0 = static_cast<UMatData::MemoryFlag>(0);
5179         else
5180             flags0 = UMatData::COPY_ON_MAP;
5181     }
5182
5183     UMatData* allocate(int dims, const int* sizes, int type,
5184                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5185     {
5186         if(!useOpenCL())
5187             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5188
5189         flushCleanupQueue();
5190
5191         CV_Assert(data == 0);
5192         size_t total = CV_ELEM_SIZE(type);
5193         for( int i = dims-1; i >= 0; i-- )
5194         {
5195             if( step )
5196                 step[i] = total;
5197             total *= sizes[i];
5198         }
5199
5200         Context& ctx = Context::getDefault();
5201         if (!ctx.getImpl())
5202             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5203         Context::Impl& ctxImpl = *ctx.getImpl();
5204
5205         int createFlags = 0;
5206         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5207         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5208
5209         void* handle = NULL;
5210         int allocatorFlags = 0;
5211
5212 #ifdef HAVE_OPENCL_SVM
5213         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5214         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5215         {
5216             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5217             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5218
5219             // this property is constant, so single buffer pool can be used here
5220             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5221             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5222         }
5223         else
5224 #endif
5225         if (createFlags == 0)
5226         {
5227             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5228             handle = ctxImpl.getBufferPool().allocate(total);
5229         }
5230         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5231         {
5232             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5233             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5234         }
5235         else
5236         {
5237             CV_Assert(handle != NULL); // Unsupported, throw
5238         }
5239
5240         if (!handle)
5241             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5242
5243         UMatData* u = new UMatData(this);
5244         u->data = 0;
5245         u->size = total;
5246         u->handle = handle;
5247         u->flags = flags0;
5248         u->allocatorFlags_ = allocatorFlags;
5249         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5250         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5251         u->markHostCopyObsolete(true);
5252         opencl_allocator_stats.onAllocate(u->size);
5253         return u;
5254     }
5255
5256     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5257     {
5258         if(!u)
5259             return false;
5260
5261         flushCleanupQueue();
5262
5263         UMatDataAutoLock lock(u);
5264
5265         if(u->handle == 0)
5266         {
5267             CV_Assert(u->origdata != 0);
5268             Context& ctx = Context::getDefault();
5269             int createFlags = 0;
5270             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5271             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5272
5273             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5274
5275             cl_context ctx_handle = (cl_context)ctx.ptr();
5276             int allocatorFlags = 0;
5277             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5278             void* handle = NULL;
5279             cl_int retval = CL_SUCCESS;
5280
5281 #ifdef HAVE_OPENCL_SVM
5282             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5283             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5284             if (useSVM && svmCaps.isSupportFineGrainSystem())
5285             {
5286                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5287                 tempUMatFlags = UMatData::TEMP_UMAT;
5288                 handle = u->origdata;
5289                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5290             }
5291             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5292             {
5293                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5294                 {
5295                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5296
5297                     cl_svm_mem_flags memFlags = createFlags |
5298                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5299
5300                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5301                     CV_DbgAssert(svmFns->isValid());
5302
5303                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5304                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5305                     CV_Assert(handle);
5306
5307                     cl_command_queue q = NULL;
5308                     if (!isFineGrainBuffer)
5309                     {
5310                         q = (cl_command_queue)Queue::getDefault().ptr();
5311                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5312                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5313                                 handle, u->size,
5314                                 0, NULL, NULL);
5315                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5316
5317                     }
5318                     memcpy(handle, u->origdata, u->size);
5319                     if (!isFineGrainBuffer)
5320                     {
5321                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5322                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5323                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5324                     }
5325
5326                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5327                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5328                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5329                 }
5330             }
5331             else
5332 #endif
5333             {
5334                 if( copyOnMap )
5335                     accessFlags &= ~ACCESS_FAST;
5336
5337                 tempUMatFlags = UMatData::TEMP_UMAT;
5338                 if (
5339                 #ifdef __APPLE__
5340                     !copyOnMap &&
5341                 #endif
5342                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5343                     // There are OpenCL runtime issues for less aligned data
5344                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5345                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5346                     // Avoid sharing of host memory between OpenCL buffers
5347                     && !(u->originalUMatData && u->originalUMatData->handle)
5348                 )
5349                 {
5350                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5351                                             u->size, u->origdata, &retval);
5352                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5353                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5354                 }
5355                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5356                 {
5357                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5358                                                u->size, u->origdata, &retval);
5359                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5360                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5361                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5362                 }
5363             }
5364             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5365             if(!handle || retval != CL_SUCCESS)
5366                 return false;
5367             u->handle = handle;
5368             u->prevAllocator = u->currAllocator;
5369             u->currAllocator = this;
5370             u->flags |= tempUMatFlags | flags0;
5371             u->allocatorFlags_ = allocatorFlags;
5372         }
5373         if (!!(accessFlags & ACCESS_WRITE))
5374             u->markHostCopyObsolete(true);
5375         opencl_allocator_stats.onAllocate(u->size);
5376         return true;
5377     }
5378
5379     /*void sync(UMatData* u) const
5380     {
5381         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5382         UMatDataAutoLock lock(u);
5383
5384         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5385         {
5386             if( u->tempCopiedUMat() )
5387             {
5388                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5389                                     u->size, u->origdata, 0, 0, 0);
5390             }
5391             else
5392             {
5393                 cl_int retval = 0;
5394                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5395                                                 (CL_MAP_READ | CL_MAP_WRITE),
5396                                                 0, u->size, 0, 0, 0, &retval);
5397                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5398                 clFinish(q);
5399             }
5400             u->markHostCopyObsolete(false);
5401         }
5402         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5403         {
5404             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5405                                  u->size, u->data, 0, 0, 0);
5406         }
5407     }*/
5408
5409     void deallocate(UMatData* u) const CV_OVERRIDE
5410     {
5411         if(!u)
5412             return;
5413
5414         CV_Assert(u->urefcount == 0);
5415         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5416
5417         CV_Assert(u->handle != 0);
5418         CV_Assert(u->mapcount == 0);
5419
5420         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5421             addToCleanupQueue(u);
5422         else
5423             deallocate_(u);
5424     }
5425
5426     void deallocate_(UMatData* u) const
5427     {
5428         CV_Assert(u);
5429         CV_Assert(u->handle);
5430         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5431         {
5432             opencl_allocator_stats.onFree(u->size);
5433         }
5434
5435 #ifdef _WIN32
5436         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5437             return;             // avoid any OpenCL calls
5438 #endif
5439         if(u->tempUMat())
5440         {
5441             CV_Assert(u->origdata);
5442 //            UMatDataAutoLock lock(u);
5443
5444             if (u->hostCopyObsolete())
5445             {
5446 #ifdef HAVE_OPENCL_SVM
5447                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5448                 {
5449                     Context& ctx = Context::getDefault();
5450                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5451                     CV_DbgAssert(svmFns->isValid());
5452
5453                     if( u->tempCopiedUMat() )
5454                     {
5455                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5456                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5457                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5458                         cl_command_queue q = NULL;
5459                         if (!isFineGrainBuffer)
5460                         {
5461                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5462                             q = (cl_command_queue)Queue::getDefault().ptr();
5463                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5464                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5465                                     u->handle, u->size,
5466                                     0, NULL, NULL);
5467                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5468                         }
5469                         clFinish(q);
5470                         memcpy(u->origdata, u->handle, u->size);
5471                         if (!isFineGrainBuffer)
5472                         {
5473                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5474                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5475                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5476                         }
5477                     }
5478                     else
5479                     {
5480                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5481                         // nothing
5482                     }
5483                 }
5484                 else
5485 #endif
5486                 {
5487                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5488                     if( u->tempCopiedUMat() )
5489                     {
5490                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5491                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5492                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5493                     }
5494                     else
5495                     {
5496                         cl_int retval = 0;
5497                         if (u->tempUMat())
5498                         {
5499                             CV_Assert(u->mapcount == 0);
5500                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5501                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5502                                 (CL_MAP_READ | CL_MAP_WRITE),
5503                                 0, u->size, 0, 0, 0, &retval);
5504                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5505                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5506                             if (u->originalUMatData)
5507                             {
5508                                 CV_Assert(u->originalUMatData->data == data);
5509                             }
5510                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5511                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5512                             CV_OCL_DBG_CHECK(clFinish(q));
5513                         }
5514                     }
5515                 }
5516                 u->markHostCopyObsolete(false);
5517             }
5518             else
5519             {
5520                 // nothing
5521             }
5522 #ifdef HAVE_OPENCL_SVM
5523             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5524             {
5525                 if( u->tempCopiedUMat() )
5526                 {
5527                     Context& ctx = Context::getDefault();
5528                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5529                     CV_DbgAssert(svmFns->isValid());
5530
5531                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5532                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5533                 }
5534             }
5535             else
5536 #endif
5537             {
5538                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5539                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5540             }
5541             u->handle = 0;
5542             u->markDeviceCopyObsolete(true);
5543             u->currAllocator = u->prevAllocator;
5544             u->prevAllocator = NULL;
5545             if(u->data && u->copyOnMap() && u->data != u->origdata)
5546                 fastFree(u->data);
5547             u->data = u->origdata;
5548             u->currAllocator->deallocate(u);
5549             u = NULL;
5550         }
5551         else
5552         {
5553             CV_Assert(u->origdata == NULL);
5554             if(u->data && u->copyOnMap() && u->data != u->origdata)
5555             {
5556                 fastFree(u->data);
5557                 u->data = 0;
5558                 u->markHostCopyObsolete(true);
5559             }
5560             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5561             {
5562                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5563                 CV_Assert(pCtx);
5564                 ocl::Context& ctx = *pCtx.get();
5565                 CV_Assert(ctx.getImpl());
5566                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5567             }
5568             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5569             {
5570                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5571                 CV_Assert(pCtx);
5572                 ocl::Context& ctx = *pCtx.get();
5573                 CV_Assert(ctx.getImpl());
5574                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5575             }
5576 #ifdef HAVE_OPENCL_SVM
5577             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5578             {
5579                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5580                 CV_Assert(pCtx);
5581                 ocl::Context& ctx = *pCtx.get();
5582                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5583                 {
5584                     //nothing
5585                 }
5586                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5587                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5588                 {
5589                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5590                     CV_DbgAssert(svmFns->isValid());
5591                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5592
5593                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5594                     {
5595                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5596                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5597                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5598                     }
5599                 }
5600                 CV_Assert(ctx.getImpl());
5601                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5602             }
5603 #endif
5604             else
5605             {
5606                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5607             }
5608             u->handle = 0;
5609             u->markDeviceCopyObsolete(true);
5610             delete u;
5611             u = NULL;
5612         }
5613         CV_Assert(u == NULL);
5614     }
5615
5616     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5617     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5618     {
5619         CV_Assert(u && u->handle);
5620
5621         if (!!(accessFlags & ACCESS_WRITE))
5622             u->markDeviceCopyObsolete(true);
5623
5624         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5625
5626         {
5627             if( !u->copyOnMap() )
5628             {
5629                 // TODO
5630                 // because there can be other map requests for the same UMat with different access flags,
5631                 // we use the universal (read-write) access mode.
5632 #ifdef HAVE_OPENCL_SVM
5633                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5634                 {
5635                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5636                     {
5637                         Context& ctx = Context::getDefault();
5638                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5639                         CV_DbgAssert(svmFns->isValid());
5640
5641                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5642                         {
5643                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5644                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5645                                     u->handle, u->size,
5646                                     0, NULL, NULL);
5647                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5648                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5649                         }
5650                     }
5651                     clFinish(q);
5652                     u->data = (uchar*)u->handle;
5653                     u->markHostCopyObsolete(false);
5654                     u->markDeviceMemMapped(true);
5655                     return;
5656                 }
5657 #endif
5658
5659                 cl_int retval = CL_SUCCESS;
5660                 if (!u->deviceMemMapped())
5661                 {
5662                     CV_Assert(u->refcount == 1);
5663                     CV_Assert(u->mapcount++ == 0);
5664                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5665                                                          (CL_MAP_READ | CL_MAP_WRITE),
5666                                                          0, u->size, 0, 0, 0, &retval);
5667                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5668                 }
5669                 if (u->data && retval == CL_SUCCESS)
5670                 {
5671                     u->markHostCopyObsolete(false);
5672                     u->markDeviceMemMapped(true);
5673                     return;
5674                 }
5675
5676                 // TODO Is it really a good idea and was it tested well?
5677                 // if map failed, switch to copy-on-map mode for the particular buffer
5678                 u->flags |= UMatData::COPY_ON_MAP;
5679             }
5680
5681             if(!u->data)
5682             {
5683                 u->data = (uchar*)fastMalloc(u->size);
5684                 u->markHostCopyObsolete(true);
5685             }
5686         }
5687
5688         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5689         {
5690             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5691 #ifdef HAVE_OPENCL_SVM
5692             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5693 #endif
5694             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5695                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5696             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5697                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5698             u->markHostCopyObsolete(false);
5699         }
5700     }
5701
5702     void unmap(UMatData* u) const CV_OVERRIDE
5703     {
5704         if(!u)
5705             return;
5706
5707
5708         CV_Assert(u->handle != 0);
5709
5710         UMatDataAutoLock autolock(u);
5711
5712         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5713         cl_int retval = 0;
5714         if( !u->copyOnMap() && u->deviceMemMapped() )
5715         {
5716             CV_Assert(u->data != NULL);
5717 #ifdef HAVE_OPENCL_SVM
5718             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5719             {
5720                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5721                 {
5722                     Context& ctx = Context::getDefault();
5723                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5724                     CV_DbgAssert(svmFns->isValid());
5725
5726                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5727                     {
5728                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5729                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5730                                 0, NULL, NULL);
5731                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5732                         clFinish(q);
5733                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5734                     }
5735                 }
5736                 if (u->refcount == 0)
5737                     u->data = 0;
5738                 u->markDeviceCopyObsolete(false);
5739                 u->markHostCopyObsolete(true);
5740                 return;
5741             }
5742 #endif
5743             if (u->refcount == 0)
5744             {
5745                 CV_Assert(u->mapcount-- == 1);
5746                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5747                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5748                 if (Device::getDefault().isAMD())
5749                 {
5750                     // required for multithreaded applications (see stitching test)
5751                     CV_OCL_DBG_CHECK(clFinish(q));
5752                 }
5753                 u->markDeviceMemMapped(false);
5754                 u->data = 0;
5755                 u->markDeviceCopyObsolete(false);
5756                 u->markHostCopyObsolete(true);
5757             }
5758         }
5759         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5760         {
5761             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5762 #ifdef HAVE_OPENCL_SVM
5763             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5764 #endif
5765             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5766                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5767             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5768                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5769             u->markDeviceCopyObsolete(false);
5770             u->markHostCopyObsolete(true);
5771         }
5772     }
5773
5774     bool checkContinuous(int dims, const size_t sz[],
5775                          const size_t srcofs[], const size_t srcstep[],
5776                          const size_t dstofs[], const size_t dststep[],
5777                          size_t& total, size_t new_sz[],
5778                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5779                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5780     {
5781         bool iscontinuous = true;
5782         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5783         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5784         total = sz[dims-1];
5785         for( int i = dims-2; i >= 0; i-- )
5786         {
5787             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5788                 iscontinuous = false;
5789             total *= sz[i];
5790             if( srcofs )
5791                 srcrawofs += srcofs[i]*srcstep[i];
5792             if( dstofs )
5793                 dstrawofs += dstofs[i]*dststep[i];
5794         }
5795
5796         if( !iscontinuous )
5797         {
5798             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5799             if( dims == 2 )
5800             {
5801                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5802                 // we assume that new_... arrays are initialized by caller
5803                 // with 0's, so there is no else branch
5804                 if( srcofs )
5805                 {
5806                     new_srcofs[0] = srcofs[1];
5807                     new_srcofs[1] = srcofs[0];
5808                     new_srcofs[2] = 0;
5809                 }
5810
5811                 if( dstofs )
5812                 {
5813                     new_dstofs[0] = dstofs[1];
5814                     new_dstofs[1] = dstofs[0];
5815                     new_dstofs[2] = 0;
5816                 }
5817
5818                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5819                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5820             }
5821             else
5822             {
5823                 // we could check for dims == 3 here,
5824                 // but from user perspective this one is more informative
5825                 CV_Assert(dims <= 3);
5826                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5827                 if( srcofs )
5828                 {
5829                     new_srcofs[0] = srcofs[2];
5830                     new_srcofs[1] = srcofs[1];
5831                     new_srcofs[2] = srcofs[0];
5832                 }
5833
5834                 if( dstofs )
5835                 {
5836                     new_dstofs[0] = dstofs[2];
5837                     new_dstofs[1] = dstofs[1];
5838                     new_dstofs[2] = dstofs[0];
5839                 }
5840
5841                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5842                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5843             }
5844         }
5845         return iscontinuous;
5846     }
5847
5848     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5849                   const size_t srcofs[], const size_t srcstep[],
5850                   const size_t dststep[]) const CV_OVERRIDE
5851     {
5852         if(!u)
5853             return;
5854         UMatDataAutoLock autolock(u);
5855
5856         if( u->data && !u->hostCopyObsolete() )
5857         {
5858             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5859             return;
5860         }
5861         CV_Assert( u->handle != 0 );
5862
5863         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5864
5865         size_t total = 0, new_sz[] = {0, 0, 0};
5866         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5867         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5868
5869         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5870                                             total, new_sz,
5871                                             srcrawofs, new_srcofs, new_srcstep,
5872                                             dstrawofs, new_dstofs, new_dststep);
5873
5874 #ifdef HAVE_OPENCL_SVM
5875         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5876         {
5877             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5878             Context& ctx = Context::getDefault();
5879             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5880             CV_DbgAssert(svmFns->isValid());
5881
5882             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5883             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5884             {
5885                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5886                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5887                         u->handle, u->size,
5888                         0, NULL, NULL);
5889                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5890             }
5891             clFinish(q);
5892             if( iscontinuous )
5893             {
5894                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5895             }
5896             else
5897             {
5898                 // This code is from MatAllocator::download()
5899                 int isz[CV_MAX_DIM];
5900                 uchar* srcptr = (uchar*)u->handle;
5901                 for( int i = 0; i < dims; i++ )
5902                 {
5903                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5904                     if( sz[i] == 0 )
5905                     return;
5906                     if( srcofs )
5907                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5908                     isz[i] = (int)sz[i];
5909                 }
5910
5911                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5912                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5913
5914                 const Mat* arrays[] = { &src, &dst };
5915                 uchar* ptrs[2];
5916                 NAryMatIterator it(arrays, ptrs, 2);
5917                 size_t j, planesz = it.size;
5918
5919                 for( j = 0; j < it.nplanes; j++, ++it )
5920                     memcpy(ptrs[1], ptrs[0], planesz);
5921             }
5922             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5923             {
5924                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5925                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5926                         0, NULL, NULL);
5927                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5928                 clFinish(q);
5929             }
5930         }
5931         else
5932 #endif
5933         {
5934             if( iscontinuous )
5935             {
5936                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5937                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5938                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5939             }
5940             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5941             {
5942                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5943                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5944                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5945                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5946                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5947                 uchar* ptr = alignedPtr.getAlignedPtr();
5948
5949                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5950                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5951                 total = std::min(total, u->size - new_srcrawofs);
5952                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5953                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5954                 for( size_t i = 0; i < new_sz[1]; i++ )
5955                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5956             }
5957             else
5958             {
5959                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5960                 uchar* ptr = alignedPtr.getAlignedPtr();
5961
5962                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5963                     new_srcofs, new_dstofs, new_sz,
5964                     new_srcstep[0], 0,
5965                     new_dststep[0], 0,
5966                     ptr, 0, 0, 0));
5967             }
5968         }
5969     }
5970
5971     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5972                 const size_t dstofs[], const size_t dststep[],
5973                 const size_t srcstep[]) const CV_OVERRIDE
5974     {
5975         if(!u)
5976             return;
5977
5978         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5979         CV_Assert(u->refcount == 0 || u->tempUMat());
5980
5981         size_t total = 0, new_sz[] = {0, 0, 0};
5982         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5983         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5984
5985         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5986                                             total, new_sz,
5987                                             srcrawofs, new_srcofs, new_srcstep,
5988                                             dstrawofs, new_dstofs, new_dststep);
5989
5990         UMatDataAutoLock autolock(u);
5991
5992         // if there is cached CPU copy of the GPU matrix,
5993         // we could use it as a destination.
5994         // we can do it in 2 cases:
5995         //    1. we overwrite the whole content
5996         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5997         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5998         {
5999             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6000             u->markHostCopyObsolete(false);
6001             u->markDeviceCopyObsolete(true);
6002             return;
6003         }
6004
6005         CV_Assert( u->handle != 0 );
6006         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6007
6008 #ifdef HAVE_OPENCL_SVM
6009         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6010         {
6011             CV_DbgAssert(u->data == NULL || u->data == u->handle);
6012             Context& ctx = Context::getDefault();
6013             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6014             CV_DbgAssert(svmFns->isValid());
6015
6016             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6017             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6018             {
6019                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6020                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6021                         u->handle, u->size,
6022                         0, NULL, NULL);
6023                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6024             }
6025             clFinish(q);
6026             if( iscontinuous )
6027             {
6028                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6029             }
6030             else
6031             {
6032                 // This code is from MatAllocator::upload()
6033                 int isz[CV_MAX_DIM];
6034                 uchar* dstptr = (uchar*)u->handle;
6035                 for( int i = 0; i < dims; i++ )
6036                 {
6037                     CV_Assert( sz[i] <= (size_t)INT_MAX );
6038                     if( sz[i] == 0 )
6039                     return;
6040                     if( dstofs )
6041                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6042                     isz[i] = (int)sz[i];
6043                 }
6044
6045                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6046                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6047
6048                 const Mat* arrays[] = { &src, &dst };
6049                 uchar* ptrs[2];
6050                 NAryMatIterator it(arrays, ptrs, 2);
6051                 size_t j, planesz = it.size;
6052
6053                 for( j = 0; j < it.nplanes; j++, ++it )
6054                     memcpy(ptrs[1], ptrs[0], planesz);
6055             }
6056             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6057             {
6058                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6059                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6060                         0, NULL, NULL);
6061                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6062                 clFinish(q);
6063             }
6064         }
6065         else
6066 #endif
6067         {
6068             if( iscontinuous )
6069             {
6070                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6071                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6072                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6073                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6074                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6075             }
6076             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6077             {
6078                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6079                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6080                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6081                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6082                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6083                 uchar* ptr = alignedPtr.getAlignedPtr();
6084
6085                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6086                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6087                 total = std::min(total, u->size - new_dstrawofs);
6088                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6089                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6090                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6091                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6092                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6093                 for( size_t i = 0; i < new_sz[1]; i++ )
6094                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6095                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6096                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6097             }
6098             else
6099             {
6100                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6101                 uchar* ptr = alignedPtr.getAlignedPtr();
6102
6103                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6104                     new_dstofs, new_srcofs, new_sz,
6105                     new_dststep[0], 0,
6106                     new_srcstep[0], 0,
6107                     ptr, 0, 0, 0));
6108             }
6109         }
6110         u->markHostCopyObsolete(true);
6111 #ifdef HAVE_OPENCL_SVM
6112         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6113                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6114         {
6115             // nothing
6116         }
6117         else
6118 #endif
6119         {
6120             u->markHostCopyObsolete(true);
6121         }
6122         u->markDeviceCopyObsolete(false);
6123     }
6124
6125     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6126               const size_t srcofs[], const size_t srcstep[],
6127               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6128     {
6129         if(!src || !dst)
6130             return;
6131
6132         size_t total = 0, new_sz[] = {0, 0, 0};
6133         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6134         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6135
6136         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6137                                             total, new_sz,
6138                                             srcrawofs, new_srcofs, new_srcstep,
6139                                             dstrawofs, new_dstofs, new_dststep);
6140
6141         UMatDataAutoLock src_autolock(src, dst);
6142
6143         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6144         {
6145             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6146             return;
6147         }
6148         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6149         {
6150             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6151             dst->markHostCopyObsolete(false);
6152 #ifdef HAVE_OPENCL_SVM
6153             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6154                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6155             {
6156                 // nothing
6157             }
6158             else
6159 #endif
6160             {
6161                 dst->markDeviceCopyObsolete(true);
6162             }
6163             return;
6164         }
6165
6166         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6167         CV_Assert(dst->refcount == 0);
6168         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6169
6170         cl_int retval = CL_SUCCESS;
6171 #ifdef HAVE_OPENCL_SVM
6172         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6173                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6174         {
6175             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6176                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6177             {
6178                 Context& ctx = Context::getDefault();
6179                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6180                 CV_DbgAssert(svmFns->isValid());
6181
6182                 if( iscontinuous )
6183                 {
6184                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6185                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6186                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6187                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6188                             total, 0, NULL, NULL);
6189                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6190                 }
6191                 else
6192                 {
6193                     clFinish(q);
6194                     // This code is from MatAllocator::download()/upload()
6195                     int isz[CV_MAX_DIM];
6196                     uchar* srcptr = (uchar*)src->handle;
6197                     for( int i = 0; i < dims; i++ )
6198                     {
6199                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6200                         if( sz[i] == 0 )
6201                         return;
6202                         if( srcofs )
6203                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6204                         isz[i] = (int)sz[i];
6205                     }
6206                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6207
6208                     uchar* dstptr = (uchar*)dst->handle;
6209                     for( int i = 0; i < dims; i++ )
6210                     {
6211                         if( dstofs )
6212                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6213                     }
6214                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6215
6216                     const Mat* arrays[] = { &m_src, &m_dst };
6217                     uchar* ptrs[2];
6218                     NAryMatIterator it(arrays, ptrs, 2);
6219                     size_t j, planesz = it.size;
6220
6221                     for( j = 0; j < it.nplanes; j++, ++it )
6222                         memcpy(ptrs[1], ptrs[0], planesz);
6223                 }
6224             }
6225             else
6226             {
6227                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6228                 {
6229                     map(src, ACCESS_READ);
6230                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6231                     unmap(src);
6232                 }
6233                 else
6234                 {
6235                     map(dst, ACCESS_WRITE);
6236                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6237                     unmap(dst);
6238                 }
6239             }
6240         }
6241         else
6242 #endif
6243         {
6244             if( iscontinuous )
6245             {
6246                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6247                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6248                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6249                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6250             }
6251             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6252             {
6253                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6254                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6255                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6256                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6257                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6258
6259                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6260                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6261                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6262                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6263                 uchar* srcptr = srcBuf.getAlignedPtr();
6264                 uchar* dstptr = dstBuf.getAlignedPtr();
6265
6266                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6267
6268                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6269                 src_total = std::min(src_total, src->size - new_srcrawofs);
6270                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6271                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6272
6273                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6274                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6275                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6276                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6277
6278                 for( size_t i = 0; i < new_sz[1]; i++ )
6279                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6280                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6281                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6282                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6283             }
6284             else
6285             {
6286                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6287                                                    new_srcofs, new_dstofs, new_sz,
6288                                                    new_srcstep[0], 0,
6289                                                    new_dststep[0], 0,
6290                                                    0, 0, 0));
6291             }
6292         }
6293         if (retval == CL_SUCCESS)
6294         {
6295             CV_IMPL_ADD(CV_IMPL_OCL)
6296         }
6297
6298 #ifdef HAVE_OPENCL_SVM
6299         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6300             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6301         {
6302             // nothing
6303         }
6304         else
6305 #endif
6306         {
6307             dst->markHostCopyObsolete(true);
6308         }
6309         dst->markDeviceCopyObsolete(false);
6310
6311         if( _sync )
6312         {
6313             CV_OCL_DBG_CHECK(clFinish(q));
6314         }
6315     }
6316
6317     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6318     {
6319         ocl::Context ctx = Context::getDefault();
6320         if (ctx.empty())
6321             return NULL;
6322 #ifdef HAVE_OPENCL_SVM
6323         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6324         {
6325             return &ctx.getImpl()->getBufferPoolSVM();
6326         }
6327 #endif
6328         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6329         {
6330             return &ctx.getImpl()->getBufferPoolHostPtr();
6331         }
6332         if (id != NULL && strcmp(id, "OCL") != 0)
6333         {
6334             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6335         }
6336         return &ctx.getImpl()->getBufferPool();
6337     }
6338
6339     MatAllocator* matStdAllocator;
6340
6341     mutable cv::Mutex cleanupQueueMutex;
6342     mutable std::deque<UMatData*> cleanupQueue;
6343
6344     void flushCleanupQueue() const
6345     {
6346         if (!cleanupQueue.empty())
6347         {
6348             std::deque<UMatData*> q;
6349             {
6350                 cv::AutoLock lock(cleanupQueueMutex);
6351                 q.swap(cleanupQueue);
6352             }
6353             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6354             {
6355                 deallocate_(*i);
6356             }
6357         }
6358     }
6359     void addToCleanupQueue(UMatData* u) const
6360     {
6361         //TODO: Validation check: CV_Assert(!u->tempUMat());
6362         {
6363             cv::AutoLock lock(cleanupQueueMutex);
6364             cleanupQueue.push_back(u);
6365         }
6366     }
6367 };
6368
6369 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6370 {
6371     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6372     g_isOpenCVActivated = true;
6373     return g_allocator;
6374 }
6375 MatAllocator* getOpenCLAllocator()
6376 {
6377     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6378 }
6379
6380 }} // namespace cv::ocl
6381
6382
6383 namespace cv {
6384
6385 // three funcs below are implemented in umatrix.cpp
6386 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6387               bool autoSteps = false );
6388 void finalizeHdr(UMat& m);
6389
6390 } // namespace cv
6391
6392
6393 namespace cv { namespace ocl {
6394
6395 /*
6396 // Convert OpenCL buffer memory to UMat
6397 */
6398 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6399 {
6400     int d = 2;
6401     int sizes[] = { rows, cols };
6402
6403     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6404
6405     dst.release();
6406
6407     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6408     dst.usageFlags = USAGE_DEFAULT;
6409
6410     setSize(dst, d, sizes, 0, true);
6411     dst.offset = 0;
6412
6413     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6414     cl_mem_object_type mem_type = 0;
6415
6416     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6417
6418     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6419
6420     size_t total = 0;
6421     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6422
6423     CV_OCL_CHECK(clRetainMemObject(memobj));
6424
6425     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6426     CV_Assert(total >= rows * step);
6427
6428     // attach clBuffer to UMatData
6429     dst.u = new UMatData(getOpenCLAllocator());
6430     dst.u->data            = 0;
6431     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6432     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6433     dst.u->handle          = cl_mem_buffer;
6434     dst.u->origdata        = 0;
6435     dst.u->prevAllocator   = 0;
6436     dst.u->size            = total;
6437
6438     finalizeHdr(dst);
6439     dst.addref();
6440
6441     return;
6442 } // convertFromBuffer()
6443
6444
6445 /*
6446 // Convert OpenCL image2d_t memory to UMat
6447 */
6448 void convertFromImage(void* cl_mem_image, UMat& dst)
6449 {
6450     cl_mem             clImage = (cl_mem)cl_mem_image;
6451     cl_mem_object_type mem_type = 0;
6452
6453     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6454
6455     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6456
6457     cl_image_format fmt = { 0, 0 };
6458     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6459
6460     int depth = CV_8U;
6461     switch (fmt.image_channel_data_type)
6462     {
6463     case CL_UNORM_INT8:
6464     case CL_UNSIGNED_INT8:
6465         depth = CV_8U;
6466         break;
6467
6468     case CL_SNORM_INT8:
6469     case CL_SIGNED_INT8:
6470         depth = CV_8S;
6471         break;
6472
6473     case CL_UNORM_INT16:
6474     case CL_UNSIGNED_INT16:
6475         depth = CV_16U;
6476         break;
6477
6478     case CL_SNORM_INT16:
6479     case CL_SIGNED_INT16:
6480         depth = CV_16S;
6481         break;
6482
6483     case CL_SIGNED_INT32:
6484         depth = CV_32S;
6485         break;
6486
6487     case CL_FLOAT:
6488         depth = CV_32F;
6489         break;
6490
6491     default:
6492         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6493     }
6494
6495     int type = CV_8UC1;
6496     switch (fmt.image_channel_order)
6497     {
6498     case CL_R:
6499         type = CV_MAKE_TYPE(depth, 1);
6500         break;
6501
6502     case CL_RGBA:
6503     case CL_BGRA:
6504     case CL_ARGB:
6505         type = CV_MAKE_TYPE(depth, 4);
6506         break;
6507
6508     default:
6509         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6510         break;
6511     }
6512
6513     size_t step = 0;
6514     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6515
6516     size_t w = 0;
6517     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6518
6519     size_t h = 0;
6520     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6521
6522     dst.create((int)h, (int)w, type);
6523
6524     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6525
6526     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6527
6528     size_t offset = 0;
6529     size_t src_origin[3] = { 0, 0, 0 };
6530     size_t region[3] = { w, h, 1 };
6531     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6532
6533     CV_OCL_CHECK(clFinish(q));
6534
6535     return;
6536 } // convertFromImage()
6537
6538
6539 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6540
6541 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6542 {
6543     cl_uint numDevices = 0;
6544     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6545     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6546     {
6547         CV_OCL_DBG_CHECK_RESULT(status,
6548             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6549     }
6550
6551     if (numDevices == 0)
6552     {
6553         devices.clear();
6554         return;
6555     }
6556
6557     devices.resize((size_t)numDevices);
6558     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6559 }
6560
6561 struct PlatformInfo::Impl
6562 {
6563     Impl(void* id)
6564     {
6565         refcount = 1;
6566         handle = *(cl_platform_id*)id;
6567         getDevices(devices, handle);
6568     }
6569
6570     String getStrProp(cl_platform_info prop) const
6571     {
6572         char buf[1024];
6573         size_t sz=0;
6574         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6575             sz < sizeof(buf) ? String(buf) : String();
6576     }
6577
6578     IMPLEMENT_REFCOUNTABLE();
6579     std::vector<cl_device_id> devices;
6580     cl_platform_id handle;
6581 };
6582
6583 PlatformInfo::PlatformInfo()
6584 {
6585     p = 0;
6586 }
6587
6588 PlatformInfo::PlatformInfo(void* platform_id)
6589 {
6590     p = new Impl(platform_id);
6591 }
6592
6593 PlatformInfo::~PlatformInfo()
6594 {
6595     if(p)
6596         p->release();
6597 }
6598
6599 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6600 {
6601     if (i.p)
6602         i.p->addref();
6603     p = i.p;
6604 }
6605
6606 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6607 {
6608     if (i.p != p)
6609     {
6610         if (i.p)
6611             i.p->addref();
6612         if (p)
6613             p->release();
6614         p = i.p;
6615     }
6616     return *this;
6617 }
6618
6619 int PlatformInfo::deviceNumber() const
6620 {
6621     return p ? (int)p->devices.size() : 0;
6622 }
6623
6624 void PlatformInfo::getDevice(Device& device, int d) const
6625 {
6626     CV_Assert(p && d < (int)p->devices.size() );
6627     if(p)
6628         device.set(p->devices[d]);
6629 }
6630
6631 String PlatformInfo::name() const
6632 {
6633     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6634 }
6635
6636 String PlatformInfo::vendor() const
6637 {
6638     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6639 }
6640
6641 String PlatformInfo::version() const
6642 {
6643     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6644 }
6645
6646 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6647 {
6648     cl_uint numPlatforms = 0;
6649     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6650
6651     if (numPlatforms == 0)
6652     {
6653         platforms.clear();
6654         return;
6655     }
6656
6657     platforms.resize((size_t)numPlatforms);
6658     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6659 }
6660
6661 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6662 {
6663     std::vector<cl_platform_id> platforms;
6664     getPlatforms(platforms);
6665
6666     for (size_t i = 0; i < platforms.size(); i++)
6667         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6668 }
6669
6670 const char* typeToStr(int type)
6671 {
6672     static const char* tab[]=
6673     {
6674         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6675         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6676         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6677         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6678         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6679         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6680         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6681         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6682         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6683     };
6684     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6685     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6686     CV_Assert(result);
6687     return result;
6688 }
6689
6690 const char* memopTypeToStr(int type)
6691 {
6692     static const char* tab[] =
6693     {
6694         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6695         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6696         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6697         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6698         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6699         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6700         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6701         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6702         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6703     };
6704     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6705     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6706     CV_Assert(result);
6707     return result;
6708 }
6709
6710 const char* vecopTypeToStr(int type)
6711 {
6712     static const char* tab[] =
6713     {
6714         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6715         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6716         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6717         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6718         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6719         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6720         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6721         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6722         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6723     };
6724     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6725     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6726     CV_Assert(result);
6727     return result;
6728 }
6729
6730 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6731 {
6732     if( sdepth == ddepth )
6733         return "noconvert";
6734     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6735     if( ddepth >= CV_32F ||
6736         (ddepth == CV_32S && sdepth < CV_32S) ||
6737         (ddepth == CV_16S && sdepth <= CV_8S) ||
6738         (ddepth == CV_16U && sdepth == CV_8U))
6739     {
6740         sprintf(buf, "convert_%s", typestr);
6741     }
6742     else if( sdepth >= CV_32F )
6743         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6744     else
6745         sprintf(buf, "convert_%s_sat", typestr);
6746
6747     return buf;
6748 }
6749
6750 const char* getOpenCLErrorString(int errorCode)
6751 {
6752 #define CV_OCL_CODE(id) case id: return #id
6753 #define CV_OCL_CODE_(id, name) case id: return #name
6754     switch (errorCode)
6755     {
6756     CV_OCL_CODE(CL_SUCCESS);
6757     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6758     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6759     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6760     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6761     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6762     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6763     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6764     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6765     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6766     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6767     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6768     CV_OCL_CODE(CL_MAP_FAILURE);
6769     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6770     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6771     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6772     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6773     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6774     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6775     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6776     CV_OCL_CODE(CL_INVALID_VALUE);
6777     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6778     CV_OCL_CODE(CL_INVALID_PLATFORM);
6779     CV_OCL_CODE(CL_INVALID_DEVICE);
6780     CV_OCL_CODE(CL_INVALID_CONTEXT);
6781     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6782     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6783     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6784     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6785     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6786     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6787     CV_OCL_CODE(CL_INVALID_SAMPLER);
6788     CV_OCL_CODE(CL_INVALID_BINARY);
6789     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6790     CV_OCL_CODE(CL_INVALID_PROGRAM);
6791     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6792     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6793     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6794     CV_OCL_CODE(CL_INVALID_KERNEL);
6795     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6796     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6797     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6798     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6799     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6800     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6801     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6802     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6803     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6804     CV_OCL_CODE(CL_INVALID_EVENT);
6805     CV_OCL_CODE(CL_INVALID_OPERATION);
6806     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6807     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6808     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6809     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6810     // OpenCL 1.1
6811     CV_OCL_CODE(CL_INVALID_PROPERTY);
6812     // OpenCL 1.2
6813     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6814     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6815     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6816     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6817     // OpenCL 2.0
6818     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6819     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6820     // Extensions
6821     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6822     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6823     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6824     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6825     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6826     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6827     default: return "Unknown OpenCL error";
6828     }
6829 #undef CV_OCL_CODE
6830 #undef CV_OCL_CODE_
6831 }
6832
6833 template <typename T>
6834 static std::string kerToStr(const Mat & k)
6835 {
6836     int width = k.cols - 1, depth = k.depth();
6837     const T * const data = k.ptr<T>();
6838
6839     std::ostringstream stream;
6840     stream.precision(10);
6841
6842     if (depth <= CV_8S)
6843     {
6844         for (int i = 0; i < width; ++i)
6845             stream << "DIG(" << (int)data[i] << ")";
6846         stream << "DIG(" << (int)data[width] << ")";
6847     }
6848     else if (depth == CV_32F)
6849     {
6850         stream.setf(std::ios_base::showpoint);
6851         for (int i = 0; i < width; ++i)
6852             stream << "DIG(" << data[i] << "f)";
6853         stream << "DIG(" << data[width] << "f)";
6854     }
6855     else
6856     {
6857         for (int i = 0; i < width; ++i)
6858             stream << "DIG(" << data[i] << ")";
6859         stream << "DIG(" << data[width] << ")";
6860     }
6861
6862     return stream.str();
6863 }
6864
6865 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6866 {
6867     Mat kernel = _kernel.getMat().reshape(1, 1);
6868
6869     int depth = kernel.depth();
6870     if (ddepth < 0)
6871         ddepth = depth;
6872
6873     if (ddepth != depth)
6874         kernel.convertTo(kernel, ddepth);
6875
6876     typedef std::string (* func_t)(const Mat &);
6877     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6878                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6879     const func_t func = funcs[ddepth];
6880     CV_Assert(func != 0);
6881
6882     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6883 }
6884
6885 #define PROCESS_SRC(src) \
6886     do \
6887     { \
6888         if (!src.empty()) \
6889         { \
6890             CV_Assert(src.isMat() || src.isUMat()); \
6891             Size csize = src.size(); \
6892             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6893                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6894             if (cwidth < ckercn || ckercn <= 0) \
6895                 return 1; \
6896             cols.push_back(cwidth); \
6897             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6898                 return 1; \
6899             offsets.push_back(src.offset()); \
6900             steps.push_back(src.step()); \
6901             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6902             kercns.push_back(ckercn); \
6903         } \
6904     } \
6905     while ((void)0, 0)
6906
6907 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6908                               InputArray src4, InputArray src5, InputArray src6,
6909                               InputArray src7, InputArray src8, InputArray src9,
6910                               OclVectorStrategy strat)
6911 {
6912     const ocl::Device & d = ocl::Device::getDefault();
6913
6914     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6915         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6916         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6917         d.preferredVectorWidthDouble(), -1 };
6918
6919     // if the device says don't use vectors
6920     if (vectorWidths[0] == 1)
6921     {
6922         // it's heuristic
6923         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6924         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6925         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6926     }
6927
6928     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6929 }
6930
6931 int checkOptimalVectorWidth(const int *vectorWidths,
6932                             InputArray src1, InputArray src2, InputArray src3,
6933                             InputArray src4, InputArray src5, InputArray src6,
6934                             InputArray src7, InputArray src8, InputArray src9,
6935                             OclVectorStrategy strat)
6936 {
6937     CV_Assert(vectorWidths);
6938
6939     int ref_type = src1.type();
6940
6941     std::vector<size_t> offsets, steps, cols;
6942     std::vector<int> dividers, kercns;
6943     PROCESS_SRC(src1);
6944     PROCESS_SRC(src2);
6945     PROCESS_SRC(src3);
6946     PROCESS_SRC(src4);
6947     PROCESS_SRC(src5);
6948     PROCESS_SRC(src6);
6949     PROCESS_SRC(src7);
6950     PROCESS_SRC(src8);
6951     PROCESS_SRC(src9);
6952
6953     size_t size = offsets.size();
6954
6955     for (size_t i = 0; i < size; ++i)
6956         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6957             dividers[i] >>= 1, kercns[i] >>= 1;
6958
6959     // default strategy
6960     int kercn = *std::min_element(kercns.begin(), kercns.end());
6961
6962     return kercn;
6963 }
6964
6965 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6966                                  InputArray src4, InputArray src5, InputArray src6,
6967                                  InputArray src7, InputArray src8, InputArray src9)
6968 {
6969     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6970 }
6971
6972 #undef PROCESS_SRC
6973
6974
6975 // TODO Make this as a method of OpenCL "BuildOptions" class
6976 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6977 {
6978     if (!buildOptions.empty())
6979         buildOptions += " ";
6980     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6981     buildOptions += format(
6982             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6983             name.c_str(), ocl::typeToStr(type),
6984             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6985             name.c_str(), (int)CV_MAT_CN(type),
6986             name.c_str(), (int)CV_ELEM_SIZE(type),
6987             name.c_str(), (int)CV_ELEM_SIZE1(type),
6988             name.c_str(), (int)depth
6989             );
6990 }
6991
6992
6993 struct Image2D::Impl
6994 {
6995     Impl(const UMat &src, bool norm, bool alias)
6996     {
6997         handle = 0;
6998         refcount = 1;
6999         init(src, norm, alias);
7000     }
7001
7002     ~Impl()
7003     {
7004         if (handle)
7005             clReleaseMemObject(handle);
7006     }
7007
7008     static cl_image_format getImageFormat(int depth, int cn, bool norm)
7009     {
7010         cl_image_format format;
7011         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7012                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7013         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7014                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
7015         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7016
7017         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7018         int channelOrder = channelOrders[cn];
7019         format.image_channel_data_type = (cl_channel_type)channelType;
7020         format.image_channel_order = (cl_channel_order)channelOrder;
7021         return format;
7022     }
7023
7024     static bool isFormatSupported(cl_image_format format)
7025     {
7026         if (!haveOpenCL())
7027             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7028
7029         cl_context context = (cl_context)Context::getDefault().ptr();
7030         if (!context)
7031             return false;
7032
7033         // Figure out how many formats are supported by this context.
7034         cl_uint numFormats = 0;
7035         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7036                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
7037                                                 NULL, &numFormats);
7038         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7039         if (numFormats > 0)
7040         {
7041             AutoBuffer<cl_image_format> formats(numFormats);
7042             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7043                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
7044                                              formats.data(), NULL);
7045             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7046             for (cl_uint i = 0; i < numFormats; ++i)
7047             {
7048                 if (!memcmp(&formats[i], &format, sizeof(format)))
7049                 {
7050                     return true;
7051                 }
7052             }
7053         }
7054         return false;
7055     }
7056
7057     void init(const UMat &src, bool norm, bool alias)
7058     {
7059         if (!haveOpenCL())
7060             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7061
7062         CV_Assert(!src.empty());
7063         CV_Assert(ocl::Device::getDefault().imageSupport());
7064
7065         int err, depth = src.depth(), cn = src.channels();
7066         CV_Assert(cn <= 4);
7067         cl_image_format format = getImageFormat(depth, cn, norm);
7068
7069         if (!isFormatSupported(format))
7070             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7071
7072         if (alias && !src.handle(ACCESS_RW))
7073             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7074
7075         cl_context context = (cl_context)Context::getDefault().ptr();
7076         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7077
7078 #ifdef CL_VERSION_1_2
7079         // this enables backwards portability to
7080         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7081         const Device & d = ocl::Device::getDefault();
7082         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7083         CV_Assert(!alias || canCreateAlias(src));
7084         if (1 < major || (1 == major && 2 <= minor))
7085         {
7086             cl_image_desc desc;
7087             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7088             desc.image_width      = src.cols;
7089             desc.image_height     = src.rows;
7090             desc.image_depth      = 0;
7091             desc.image_array_size = 1;
7092             desc.image_row_pitch  = alias ? src.step[0] : 0;
7093             desc.image_slice_pitch = 0;
7094             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7095             desc.num_mip_levels   = 0;
7096             desc.num_samples      = 0;
7097             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7098         }
7099         else
7100 #endif
7101         {
7102             CV_SUPPRESS_DEPRECATED_START
7103             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7104             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7105             CV_SUPPRESS_DEPRECATED_END
7106         }
7107         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7108
7109         size_t origin[] = { 0, 0, 0 };
7110         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7111
7112         cl_mem devData;
7113         if (!alias && !src.isContinuous())
7114         {
7115             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7116             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7117                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7118                 ).c_str());
7119
7120             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7121             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7122                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7123             CV_OCL_DBG_CHECK(clFlush(queue));
7124         }
7125         else
7126         {
7127             devData = (cl_mem)src.handle(ACCESS_READ);
7128         }
7129         CV_Assert(devData != NULL);
7130
7131         if (!alias)
7132         {
7133             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7134             if (!src.isContinuous())
7135             {
7136                 CV_OCL_DBG_CHECK(clFlush(queue));
7137                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7138             }
7139         }
7140     }
7141
7142     IMPLEMENT_REFCOUNTABLE();
7143
7144     cl_mem handle;
7145 };
7146
7147 Image2D::Image2D()
7148 {
7149     p = NULL;
7150 }
7151
7152 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7153 {
7154     p = new Impl(src, norm, alias);
7155 }
7156
7157 bool Image2D::canCreateAlias(const UMat &m)
7158 {
7159     bool ret = false;
7160     const Device & d = ocl::Device::getDefault();
7161     if (d.imageFromBufferSupport() && !m.empty())
7162     {
7163         // This is the required pitch alignment in pixels
7164         uint pitchAlign = d.imagePitchAlignment();
7165         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7166         {
7167             // We don't currently handle the case where the buffer was created
7168             // with CL_MEM_USE_HOST_PTR
7169             if (!m.u->tempUMat())
7170             {
7171                 ret = true;
7172             }
7173         }
7174     }
7175     return ret;
7176 }
7177
7178 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7179 {
7180     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7181
7182     return Impl::isFormatSupported(format);
7183 }
7184
7185 Image2D::Image2D(const Image2D & i)
7186 {
7187     p = i.p;
7188     if (p)
7189         p->addref();
7190 }
7191
7192 Image2D & Image2D::operator = (const Image2D & i)
7193 {
7194     if (i.p != p)
7195     {
7196         if (i.p)
7197             i.p->addref();
7198         if (p)
7199             p->release();
7200         p = i.p;
7201     }
7202     return *this;
7203 }
7204
7205 Image2D::~Image2D()
7206 {
7207     if (p)
7208         p->release();
7209 }
7210
7211 void* Image2D::ptr() const
7212 {
7213     return p ? p->handle : 0;
7214 }
7215
7216 bool internal::isOpenCLForced()
7217 {
7218     static bool initialized = false;
7219     static bool value = false;
7220     if (!initialized)
7221     {
7222         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7223         initialized = true;
7224     }
7225     return value;
7226 }
7227
7228 bool internal::isPerformanceCheckBypassed()
7229 {
7230     static bool initialized = false;
7231     static bool value = false;
7232     if (!initialized)
7233     {
7234         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7235         initialized = true;
7236     }
7237     return value;
7238 }
7239
7240 bool internal::isCLBuffer(UMat& u)
7241 {
7242     void* h = u.handle(ACCESS_RW);
7243     if (!h)
7244         return true;
7245     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7246 #if 1
7247     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7248         return false;
7249 #else
7250     cl_mem_object_type type = 0;
7251     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7252     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7253         return false;
7254 #endif
7255     return true;
7256 }
7257
7258 struct Timer::Impl
7259 {
7260     const Queue queue;
7261
7262     Impl(const Queue& q)
7263         : queue(q)
7264     {
7265     }
7266
7267     ~Impl(){}
7268
7269     void start()
7270     {
7271         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7272         timer.start();
7273     }
7274
7275     void stop()
7276     {
7277         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7278         timer.stop();
7279     }
7280
7281     uint64 durationNS() const
7282     {
7283         return (uint64)(timer.getTimeSec() * 1e9);
7284     }
7285
7286     TickMeter timer;
7287 };
7288
7289 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7290 Timer::~Timer() { delete p; }
7291
7292 void Timer::start()
7293 {
7294     CV_Assert(p);
7295     p->start();
7296 }
7297
7298 void Timer::stop()
7299 {
7300     CV_Assert(p);
7301     p->stop();
7302 }
7303
7304 uint64 Timer::durationNS() const
7305 {
7306     CV_Assert(p);
7307     return p->durationNS();
7308 }
7309
7310 }} // namespace
7311
7312 #ifdef HAVE_DIRECTX
7313 namespace cv { namespace directx { namespace internal {
7314 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7315 {
7316     ocl::Context::Impl* i = ctx.getImpl();
7317     CV_Assert(i);
7318     return i->getDirectXImpl();
7319 }
7320 }}} // namespace cv::directx::internal
7321 #endif
7322
7323 #endif // HAVE_OPENCL