Merge remote-tracking branch 'upstream/3.4' into merge-3.4
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_DIRECTX
117 #include "directx.hpp"
118 #endif
119
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
124 #endif
125
126 #include "umatrix.hpp"
127
128 namespace cv { namespace ocl {
129
130 #define IMPLEMENT_REFCOUNTABLE() \
131     void addref() { CV_XADD(&refcount, 1); } \
132     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
133     int refcount
134
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
136
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
139 {
140     return opencl_allocator_stats;
141 }
142
143 #ifndef _DEBUG
144 static bool isRaiseError()
145 {
146     static bool initialized = false;
147     static bool value = false;
148     if (!initialized)
149     {
150         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
151         initialized = true;
152     }
153     return value;
154 }
155 #endif
156
157 #if CV_OPENCL_TRACE_CHECK
158 static inline
159 void traceOpenCLCheck(cl_int status, const char* message)
160 {
161     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
162 }
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
164 #else
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
166 #endif
167
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
170
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
172     do { \
173         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174         if (check_result != CL_SUCCESS) \
175         { \
176             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178             CV_Error(Error::OpenCLApiCallError, error_msg); \
179         } \
180     } while (0)
181
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
183
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
185
186 #ifdef _DEBUG
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
190 #else
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
192     do { \
193         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194         if (check_result != CL_SUCCESS && isRaiseError()) \
195         { \
196             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198             CV_Error(Error::OpenCLApiCallError, error_msg); \
199         } \
200     } while (0)
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
203 #endif
204
205
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
210
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
213 #endif
214
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
217 #ifdef __APPLE__
218         true
219 #else
220         false
221 #endif
222 );
223
224 static const String getBuildExtraOptions()
225 {
226     static String param_buildExtraOptions;
227     static bool initialized = false;
228     if (!initialized)
229     {
230         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
231         initialized = true;
232         if (!param_buildExtraOptions.empty())
233             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
234     }
235     return param_buildExtraOptions;
236 }
237
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
240
241
242 struct UMat2D
243 {
244     UMat2D(const UMat& m)
245     {
246         offset = (int)m.offset;
247         step = (int)m.step;
248         rows = m.rows;
249         cols = m.cols;
250     }
251     int offset;
252     int step;
253     int rows;
254     int cols;
255 };
256
257 struct UMat3D
258 {
259     UMat3D(const UMat& m)
260     {
261         offset = (int)m.offset;
262         step = (int)m.step.p[1];
263         slicestep = (int)m.step.p[0];
264         slices = (int)m.size.p[0];
265         rows = m.size.p[1];
266         cols = m.size.p[2];
267     }
268     int offset;
269     int slicestep;
270     int step;
271     int slices;
272     int rows;
273     int cols;
274 };
275
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
278 {
279     static uint64 table[256];
280     static bool initialized = false;
281
282     if( !initialized )
283     {
284         for( int i = 0; i < 256; i++ )
285         {
286             uint64 c = i;
287             for( int j = 0; j < 8; j++ )
288                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
289             table[i] = c;
290         }
291         initialized = true;
292     }
293
294     uint64 crc = ~crc0;
295     for( size_t idx = 0; idx < size; idx++ )
296         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
297
298     return ~crc;
299 }
300
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
303 {
304     cv::String cache_path_;
305     cv::String cache_lock_filename_;
306     cv::Ptr<utils::fs::FileLock> cache_lock_;
307
308     typedef std::map<std::string, std::string> ContextCacheType;
309     ContextCacheType prepared_contexts_;
310     Mutex mutex_prepared_contexts_;
311
312     OpenCLBinaryCacheConfigurator()
313     {
314         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315         if (!CV_OPENCL_CACHE_ENABLE)
316         {
317             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
318             return;
319         }
320         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321         if (cache_path_.empty())
322         {
323             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
324         }
325         do
326         {
327             try
328             {
329                 if (cache_path_.empty())
330                     break;
331                 if (cache_path_ == "disabled")
332                     break;
333                 if (!utils::fs::createDirectories(cache_path_))
334                 {
335                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
336                     clear();
337                     break;
338                 }
339
340                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
341                 {
342                     cache_lock_filename_ = cache_path_ + ".lock";
343                     if (!utils::fs::exists(cache_lock_filename_))
344                     {
345                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347                         if (!lock_filename.is_open())
348                         {
349                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
350                             break;
351                         }
352                     }
353
354                     try
355                     {
356                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
358                         {
359                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
360                         }
361                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
362                     }
363                     catch (const cv::Exception& e)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
366                     }
367                     catch (...)
368                     {
369                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
370                     }
371                 }
372                 else
373                 {
374                     if (CV_OPENCL_CACHE_WRITE)
375                     {
376                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377                                 "(not safe for multiprocess environment)");
378                     }
379                     else
380                     {
381                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
382                     }
383                 }
384             }
385             catch (const cv::Exception& e)
386             {
387                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
388                 clear();
389             }
390         } while (0);
391         if (!cache_path_.empty())
392         {
393             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
394             {
395                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
397             }
398             else
399             {
400                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
401             }
402         }
403     }
404
405     void clear()
406     {
407         cache_path_.clear();
408         cache_lock_filename_.clear();
409         cache_lock_.release();
410     }
411
412     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413             const std::string& cleanup_prefix)
414     {
415         if (cache_path_.empty())
416             return std::string();
417
418         AutoLock lock(mutex_prepared_contexts_);
419
420         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421         if (found_it != prepared_contexts_.end())
422             return found_it->second;
423
424         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
425
426         std::string target_directory = cache_path_ + ctx_prefix + "/";
427         bool result = utils::fs::isDirectory(target_directory);
428         if (!result)
429         {
430             try
431             {
432                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433                 if (utils::fs::createDirectories(target_directory))
434                 {
435                     result = true;
436                 }
437                 else
438                 {
439                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
440                 }
441             }
442             catch (const cv::Exception& e)
443             {
444                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
445             }
446         }
447         target_directory = result ? target_directory : std::string();
448         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
449
450         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
451         {
452             try
453             {
454                 std::vector<String> entries;
455                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456                 std::vector<String> remove_entries;
457                 for (size_t i = 0; i < entries.size(); i++)
458                 {
459                     const String& name = entries[i];
460                     if (0 == name.find(cleanup_prefix))
461                     {
462                         if (0 == name.find(ctx_prefix))
463                             continue; // skip current
464                         remove_entries.push_back(name);
465                     }
466                 }
467                 if (!remove_entries.empty())
468                 {
469                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470                             ? "Detected OpenCL cache directory for other version of OpenCL device."
471                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
472                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474                     for (size_t i = 0; i < remove_entries.size(); i++)
475                     {
476                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
477                     }
478                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
479
480                     for (size_t i = 0; i < remove_entries.size(); i++)
481                     {
482                         const String& name = remove_entries[i];
483                         cv::String path = utils::fs::join(cache_path_, name);
484                         try
485                         {
486                             utils::fs::remove_all(path);
487                             CV_LOG_WARNING(NULL, "Removed: " << path);
488                         }
489                         catch (const cv::Exception& e)
490                         {
491                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
492                         }
493                     }
494                 }
495             }
496             catch (...)
497             {
498                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
499             }
500         }
501
502         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503         return target_directory;
504     }
505
506     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
507     {
508         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
509     }
510 };
511 class BinaryProgramFile
512 {
513     enum { MAX_ENTRIES = 64 };
514
515     typedef unsigned int uint32_t;
516
517     struct CV_DECL_ALIGNED(4) FileHeader
518     {
519         uint32_t sourceSignatureSize;
520         //char sourceSignature[];
521     };
522
523     struct CV_DECL_ALIGNED(4) FileTable
524     {
525         uint32_t numberOfEntries;
526         //uint32_t firstEntryOffset[];
527     };
528
529     struct CV_DECL_ALIGNED(4) FileEntry
530     {
531         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
532         uint32_t keySize;
533         uint32_t dataSize;
534         //char key[];
535         //char data[];
536     };
537
538     const std::string fileName_;
539     const char* const sourceSignature_;
540     const size_t sourceSignatureSize_;
541
542     std::fstream f;
543
544     uint32_t entryOffsets[MAX_ENTRIES];
545
546     uint32_t getHash(const std::string& options)
547     {
548         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549         return hash & (MAX_ENTRIES - 1);
550     }
551
552     inline size_t getFileSize()
553     {
554         size_t pos = (size_t)f.tellg();
555         f.seekg(0, std::fstream::end);
556         size_t fileSize = (size_t)f.tellg();
557         f.seekg(pos, std::fstream::beg);
558         return fileSize;
559     }
560     inline uint32_t readUInt32()
561     {
562         uint32_t res = 0;
563         f.read((char*)&res, sizeof(uint32_t));
564         CV_Assert(!f.fail());
565         return res;
566     }
567     inline void writeUInt32(const uint32_t value)
568     {
569         uint32_t v = value;
570         f.write((char*)&v, sizeof(uint32_t));
571         CV_Assert(!f.fail());
572     }
573
574     inline void seekReadAbsolute(size_t pos)
575     {
576         f.seekg(pos, std::fstream::beg);
577         CV_Assert(!f.fail());
578     }
579     inline void seekReadRelative(size_t pos)
580     {
581         f.seekg(pos, std::fstream::cur);
582         CV_Assert(!f.fail());
583     }
584
585     inline void seekWriteAbsolute(size_t pos)
586     {
587         f.seekp(pos, std::fstream::beg);
588         CV_Assert(!f.fail());
589     }
590
591     void clearFile()
592     {
593         f.close();
594         if (0 != remove(fileName_.c_str()))
595             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
596         return;
597     }
598
599 public:
600     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
602     {
603         CV_StaticAssert(sizeof(uint32_t) == 4, "");
604         CV_Assert(sourceSignature_ != NULL);
605         CV_Assert(sourceSignatureSize_ > 0);
606         memset(entryOffsets, 0, sizeof(entryOffsets));
607
608         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610         if(f.is_open() && getFileSize() > 0)
611         {
612             bool isValid = false;
613             try
614             {
615                 uint32_t fileSourceSignatureSize = readUInt32();
616                 if (fileSourceSignatureSize == sourceSignatureSize_)
617                 {
618                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
620                     if (f.eof())
621                     {
622                         CV_LOG_ERROR(NULL, "Unexpected EOF");
623                     }
624                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
625                     {
626                         isValid = true;
627                     }
628                 }
629                 if (!isValid)
630                 {
631                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
632                 }
633             }
634             catch (const cv::Exception& e)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
637             }
638             catch (...)
639             {
640                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
641             }
642             if (!isValid)
643             {
644                 clearFile();
645             }
646             else
647             {
648                 seekReadAbsolute(0);
649             }
650         }
651     }
652
653     bool read(const std::string& key, std::vector<char>& buf)
654     {
655         if (!f.is_open())
656             return false;
657
658         size_t fileSize = getFileSize();
659         if (fileSize == 0)
660         {
661             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
662             clearFile();
663             return false;
664         }
665         seekReadAbsolute(0);
666
667         // bypass FileHeader
668         uint32_t fileSourceSignatureSize = readUInt32();
669         CV_Assert(fileSourceSignatureSize > 0);
670         seekReadRelative(fileSourceSignatureSize);
671
672         uint32_t numberOfEntries = readUInt32();
673         CV_Assert(numberOfEntries > 0);
674         if (numberOfEntries != MAX_ENTRIES)
675         {
676             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
677             clearFile();
678             return false;
679         }
680         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681         CV_Assert(!f.fail());
682
683         uint32_t entryNum = getHash(key);
684
685         uint32_t entryOffset = entryOffsets[entryNum];
686         FileEntry entry;
687         while (entryOffset > 0)
688         {
689             seekReadAbsolute(entryOffset);
690             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691             f.read((char*)&entry, sizeof(entry));
692             CV_Assert(!f.fail());
693             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694             if (key.size() == entry.keySize)
695             {
696                 if (entry.keySize > 0)
697                 {
698                     f.read(fileKey.data(), entry.keySize);
699                     CV_Assert(!f.fail());
700                 }
701                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
702                 {
703                     buf.resize(entry.dataSize);
704                     f.read(&buf[0], entry.dataSize);
705                     CV_Assert(!f.fail());
706                     seekReadAbsolute(0);
707                     CV_LOG_VERBOSE(NULL, 0, "Read...");
708                     return true;
709                 }
710             }
711             if (entry.nextEntryFileOffset == 0)
712                 break;
713             entryOffset = entry.nextEntryFileOffset;
714         }
715         return false;
716     }
717
718     bool write(const std::string& key, std::vector<char>& buf)
719     {
720         if (!f.is_open())
721         {
722             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
723             if (!f.is_open())
724             {
725                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
726                 if (!f.is_open())
727                 {
728                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
729                     return false;
730                 }
731             }
732         }
733
734         size_t fileSize = getFileSize();
735         if (fileSize == 0)
736         {
737             // Write header
738             seekWriteAbsolute(0);
739             writeUInt32((uint32_t)sourceSignatureSize_);
740             f.write(sourceSignature_, sourceSignatureSize_);
741             CV_Assert(!f.fail());
742
743             writeUInt32(MAX_ENTRIES);
744             memset(entryOffsets, 0, sizeof(entryOffsets));
745             f.write((char*)entryOffsets, sizeof(entryOffsets));
746             CV_Assert(!f.fail());
747             f.flush();
748             CV_Assert(!f.fail());
749             f.close();
750             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751             CV_Assert(f.is_open());
752             fileSize = getFileSize();
753         }
754         seekReadAbsolute(0);
755
756         // bypass FileHeader
757         uint32_t fileSourceSignatureSize = readUInt32();
758         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759         seekReadRelative(fileSourceSignatureSize);
760
761         uint32_t numberOfEntries = readUInt32();
762         CV_Assert(numberOfEntries > 0);
763         if (numberOfEntries != MAX_ENTRIES)
764         {
765             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
766             clearFile();
767             return false;
768         }
769         size_t tableEntriesOffset = (size_t)f.tellg();
770         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771         CV_Assert(!f.fail());
772
773         uint32_t entryNum = getHash(key);
774
775         uint32_t entryOffset = entryOffsets[entryNum];
776         FileEntry entry;
777         while (entryOffset > 0)
778         {
779             seekReadAbsolute(entryOffset);
780             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781             f.read((char*)&entry, sizeof(entry));
782             CV_Assert(!f.fail());
783             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784             if (key.size() == entry.keySize)
785             {
786                 if (entry.keySize > 0)
787                 {
788                     f.read(fileKey.data(), entry.keySize);
789                     CV_Assert(!f.fail());
790                 }
791                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
792                 {
793                     // duplicate
794                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
795                     return false;
796                 }
797             }
798             if (entry.nextEntryFileOffset == 0)
799                 break;
800             entryOffset = entry.nextEntryFileOffset;
801         }
802         seekReadAbsolute(0);
803         if (entryOffset > 0)
804         {
805             seekWriteAbsolute(entryOffset);
806             entry.nextEntryFileOffset = (uint32_t)fileSize;
807             f.write((char*)&entry, sizeof(entry));
808             CV_Assert(!f.fail());
809         }
810         else
811         {
812             entryOffsets[entryNum] = (uint32_t)fileSize;
813             seekWriteAbsolute(tableEntriesOffset);
814             f.write((char*)entryOffsets, sizeof(entryOffsets));
815             CV_Assert(!f.fail());
816         }
817         seekWriteAbsolute(fileSize);
818         entry.nextEntryFileOffset = 0;
819         entry.dataSize = (uint32_t)buf.size();
820         entry.keySize = (uint32_t)key.size();
821         f.write((char*)&entry, sizeof(entry));
822         CV_Assert(!f.fail());
823         f.write(key.c_str(), entry.keySize);
824         CV_Assert(!f.fail());
825         f.write(&buf[0], entry.dataSize);
826         CV_Assert(!f.fail());
827         f.flush();
828         CV_Assert(!f.fail());
829         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
830         return true;
831     }
832 };
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
834
835
836
837 struct OpenCLExecutionContext::Impl
838 {
839     ocl::Context context_;
840     int device_;  // device index in context
841     ocl::Queue queue_;
842     int useOpenCL_;
843
844 protected:
845     Impl() = delete;
846
847     void _init_device(cl_device_id deviceID)
848     {
849         CV_Assert(deviceID);
850         int ndevices = (int)context_.ndevices();
851         CV_Assert(ndevices > 0);
852         bool found = false;
853         for (int i = 0; i < ndevices; i++)
854         {
855             ocl::Device d = context_.device(i);
856             cl_device_id dhandle = (cl_device_id)d.ptr();
857             if (dhandle == deviceID)
858             {
859                 device_ = i;
860                 found = true;
861                 break;
862             }
863         }
864         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
865     }
866
867     void _init_device(const ocl::Device& device)
868     {
869         CV_Assert(device.ptr());
870         int ndevices = (int)context_.ndevices();
871         CV_Assert(ndevices > 0);
872         bool found = false;
873         for (int i = 0; i < ndevices; i++)
874         {
875             ocl::Device d = context_.device(i);
876             if (d.getImpl() == device.getImpl())
877             {
878                 device_ = i;
879                 found = true;
880                 break;
881             }
882         }
883         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
884     }
885
886 public:
887     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888         : device_(0), useOpenCL_(-1)
889     {
890         CV_UNUSED(platformID);
891         CV_Assert(context);
892         CV_Assert(deviceID);
893
894         context_ = Context::fromHandle(context);
895         _init_device(deviceID);
896         queue_ = Queue(context_, context_.device(device_));
897     }
898
899     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900         : device_(0), useOpenCL_(-1)
901     {
902         CV_Assert(context.ptr());
903         CV_Assert(device.ptr());
904
905         context_ = context;
906         _init_device(device);
907         queue_ = queue;
908     }
909
910     Impl(const ocl::Context& context, const ocl::Device& device)
911         : device_(0), useOpenCL_(-1)
912     {
913         CV_Assert(context.ptr());
914         CV_Assert(device.ptr());
915
916         context_ = context;
917         _init_device(device);
918         queue_ = Queue(context_, context_.device(device_));
919     }
920
921     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
922         : context_(context)
923         , device_(device)
924         , queue_(queue)
925         , useOpenCL_(-1)
926     {
927         // nothing
928     }
929     Impl(const Impl& other)
930         : context_(other.context_)
931         , device_(other.device_)
932         , queue_(other.queue_)
933         , useOpenCL_(-1)
934     {
935         // nothing
936     }
937
938     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
939     bool useOpenCL()
940     {
941         if (useOpenCL_ < 0)
942         {
943             try
944             {
945                 useOpenCL_ = 0;
946                 if (!context_.empty() && context_.ndevices() > 0)
947                 {
948                     const Device& d = context_.device(device_);
949                     useOpenCL_ = d.available();
950                 }
951             }
952             catch (const cv::Exception&)
953             {
954                 // nothing
955             }
956             if (!useOpenCL_)
957                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
958         }
959         return useOpenCL_ > 0;
960     }
961
962     void setUseOpenCL(bool flag)
963     {
964         if (!flag)
965             useOpenCL_ = 0;
966         else
967             useOpenCL_ = -1;
968     }
969
970     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
971     {
972         CV_TRACE_FUNCTION();
973
974         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
975
976         static bool initialized = false;
977         static std::shared_ptr<Impl> g_primaryExecutionContext;
978
979         if (!initialized)
980         {
981             cv::AutoLock lock(getInitializationMutex());
982             if (!initialized)
983             {
984                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
985                 try
986                 {
987                     Context c = ocl::Context::create(std::string());
988                     if (c.ndevices())
989                     {
990                         int deviceId = 0;
991                         auto& d = c.device(deviceId);
992                         if (d.available())
993                         {
994                             auto q = ocl::Queue(c, d);
995                             if (!q.ptr())
996                             {
997                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
998                             }
999                             else
1000                             {
1001                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1003                             }
1004                         }
1005                         else
1006                         {
1007                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1008                         }
1009                     }
1010                     else
1011                     {
1012                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1013                     }
1014                 }
1015                 catch (const std::exception& e)
1016                 {
1017                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1018                 }
1019                 catch (...)
1020                 {
1021                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1022                 }
1023                 initialized = true;
1024             }
1025         }
1026         return g_primaryExecutionContext;
1027     }
1028 };
1029
1030 Context& OpenCLExecutionContext::getContext() const
1031 {
1032     CV_Assert(p);
1033     return p->context_;
1034 }
1035 Device& OpenCLExecutionContext::getDevice() const
1036 {
1037     CV_Assert(p);
1038     return p->context_.device(p->device_);
1039 }
1040 Queue& OpenCLExecutionContext::getQueue() const
1041 {
1042     CV_Assert(p);
1043     return p->queue_;
1044 }
1045
1046 bool OpenCLExecutionContext::useOpenCL() const
1047 {
1048     if (p)
1049         return p->useOpenCL();
1050     return false;
1051 }
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1053 {
1054     CV_Assert(p);
1055     p->setUseOpenCL(flag);
1056 }
1057
1058 /* static */
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1060 {
1061     CV_TRACE_FUNCTION();
1062     CoreTLSData& data = getCoreTlsData();
1063     OpenCLExecutionContext& c = data.oclExecutionContext;
1064     if (!data.oclExecutionContextInitialized)
1065     {
1066         data.oclExecutionContextInitialized = true;
1067         if (c.empty() && haveOpenCL())
1068             c.p = Impl::getInitializedExecutionContext();
1069     }
1070     return c;
1071 }
1072
1073 /* static */
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1075 {
1076     CV_TRACE_FUNCTION();
1077     CoreTLSData& data = getCoreTlsData();
1078     OpenCLExecutionContext& c = data.oclExecutionContext;
1079     return c;
1080 }
1081
1082 void OpenCLExecutionContext::bind() const
1083 {
1084     CV_TRACE_FUNCTION();
1085     CV_Assert(p);
1086     CoreTLSData& data = getCoreTlsData();
1087     data.oclExecutionContext = *this;
1088     data.oclExecutionContextInitialized = true;
1089     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1090 }
1091
1092
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1094 {
1095     CV_TRACE_FUNCTION();
1096     CV_Assert(p);
1097     const Queue q(getContext(), getDevice());
1098     return cloneWithNewQueue(q);
1099 }
1100
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1102 {
1103     CV_TRACE_FUNCTION();
1104     CV_Assert(p);
1105     CV_Assert(q.ptr() != NULL);
1106     OpenCLExecutionContext c;
1107     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1108     return c;
1109 }
1110
1111 /* static */
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1113 {
1114     CV_TRACE_FUNCTION();
1115     if (!haveOpenCL())
1116         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1117
1118     CV_Assert(!context.empty());
1119     CV_Assert(context.ptr());
1120     CV_Assert(!device.empty());
1121     CV_Assert(device.ptr());
1122     OpenCLExecutionContext ctx;
1123     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1124     return ctx;
1125
1126 }
1127
1128 /* static */
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1130 {
1131     CV_TRACE_FUNCTION();
1132     if (!haveOpenCL())
1133         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1134
1135     CV_Assert(!context.empty());
1136     CV_Assert(context.ptr());
1137     CV_Assert(!device.empty());
1138     CV_Assert(device.ptr());
1139     OpenCLExecutionContext ctx;
1140     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1141     return ctx;
1142
1143 }
1144
1145 void OpenCLExecutionContext::release()
1146 {
1147     CV_TRACE_FUNCTION();
1148     p.reset();
1149 }
1150
1151
1152 // true if we have initialized OpenCL subsystem with available platforms
1153 static bool g_isOpenCVActivated = false;
1154
1155 bool haveOpenCL()
1156 {
1157     CV_TRACE_FUNCTION();
1158     static bool g_isOpenCLInitialized = false;
1159     static bool g_isOpenCLAvailable = false;
1160
1161     if (!g_isOpenCLInitialized)
1162     {
1163         CV_TRACE_REGION("Init_OpenCL_Runtime");
1164         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1165         if (envPath)
1166         {
1167             if (cv::String(envPath) == "disabled")
1168             {
1169                 g_isOpenCLAvailable = false;
1170                 g_isOpenCLInitialized = true;
1171                 return false;
1172             }
1173         }
1174
1175         cv::AutoLock lock(getInitializationMutex());
1176         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1177         try
1178         {
1179             cl_uint n = 0;
1180             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181             g_isOpenCVActivated = n > 0;
1182             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1183         }
1184         catch (...)
1185         {
1186             g_isOpenCLAvailable = false;
1187         }
1188         g_isOpenCLInitialized = true;
1189     }
1190     return g_isOpenCLAvailable;
1191 }
1192
1193 bool useOpenCL()
1194 {
1195     CoreTLSData& data = getCoreTlsData();
1196     if (data.useOpenCL < 0)
1197     {
1198         try
1199         {
1200             data.useOpenCL = 0;
1201             if (haveOpenCL())
1202             {
1203                 auto c = OpenCLExecutionContext::getCurrent();
1204                 data.useOpenCL = c.useOpenCL();
1205             }
1206         }
1207         catch (...)
1208         {
1209             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1210         }
1211     }
1212     return data.useOpenCL > 0;
1213 }
1214
1215 bool isOpenCLActivated()
1216 {
1217     if (!g_isOpenCVActivated)
1218         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1219     return useOpenCL();
1220 }
1221
1222 void setUseOpenCL(bool flag)
1223 {
1224     CV_TRACE_FUNCTION();
1225
1226     CoreTLSData& data = getCoreTlsData();
1227     auto& c = OpenCLExecutionContext::getCurrentRef();
1228     if (!c.empty())
1229     {
1230         c.setUseOpenCL(flag);
1231         data.useOpenCL = c.useOpenCL();
1232     }
1233     else
1234     {
1235         if (!flag)
1236             data.useOpenCL = 0;
1237         else
1238             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1239     }
1240 }
1241
1242
1243
1244 #ifdef HAVE_CLAMDBLAS
1245
1246 class AmdBlasHelper
1247 {
1248 public:
1249     static AmdBlasHelper & getInstance()
1250     {
1251         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1252     }
1253
1254     bool isAvailable() const
1255     {
1256         return g_isAmdBlasAvailable;
1257     }
1258
1259     ~AmdBlasHelper()
1260     {
1261         try
1262         {
1263             clAmdBlasTeardown();
1264         }
1265         catch (...) { }
1266     }
1267
1268 protected:
1269     AmdBlasHelper()
1270     {
1271         if (!g_isAmdBlasInitialized)
1272         {
1273             AutoLock lock(getInitializationMutex());
1274
1275             if (!g_isAmdBlasInitialized)
1276             {
1277                 if (haveOpenCL())
1278                 {
1279                     try
1280                     {
1281                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1282                     }
1283                     catch (...)
1284                     {
1285                         g_isAmdBlasAvailable = false;
1286                     }
1287                 }
1288                 else
1289                     g_isAmdBlasAvailable = false;
1290
1291                 g_isAmdBlasInitialized = true;
1292             }
1293         }
1294     }
1295
1296 private:
1297     static bool g_isAmdBlasInitialized;
1298     static bool g_isAmdBlasAvailable;
1299 };
1300
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1303
1304 bool haveAmdBlas()
1305 {
1306     return AmdBlasHelper::getInstance().isAvailable();
1307 }
1308
1309 #else
1310
1311 bool haveAmdBlas()
1312 {
1313     return false;
1314 }
1315
1316 #endif
1317
1318 #ifdef HAVE_CLAMDFFT
1319
1320 class AmdFftHelper
1321 {
1322 public:
1323     static AmdFftHelper & getInstance()
1324     {
1325         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1326     }
1327
1328     bool isAvailable() const
1329     {
1330         return g_isAmdFftAvailable;
1331     }
1332
1333     ~AmdFftHelper()
1334     {
1335         try
1336         {
1337 //            clAmdFftTeardown();
1338         }
1339         catch (...) { }
1340     }
1341
1342 protected:
1343     AmdFftHelper()
1344     {
1345         if (!g_isAmdFftInitialized)
1346         {
1347             AutoLock lock(getInitializationMutex());
1348
1349             if (!g_isAmdFftInitialized)
1350             {
1351                 if (haveOpenCL())
1352                 {
1353                     try
1354                     {
1355                         cl_uint major, minor, patch;
1356                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1357
1358                         // it throws exception in case AmdFft binaries are not found
1359                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360                         g_isAmdFftAvailable = true;
1361                     }
1362                     catch (const Exception &)
1363                     {
1364                         g_isAmdFftAvailable = false;
1365                     }
1366                 }
1367                 else
1368                     g_isAmdFftAvailable = false;
1369
1370                 g_isAmdFftInitialized = true;
1371             }
1372         }
1373     }
1374
1375 private:
1376     static clAmdFftSetupData setupData;
1377     static bool g_isAmdFftInitialized;
1378     static bool g_isAmdFftAvailable;
1379 };
1380
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1384
1385 bool haveAmdFft()
1386 {
1387     return AmdFftHelper::getInstance().isAvailable();
1388 }
1389
1390 #else
1391
1392 bool haveAmdFft()
1393 {
1394     return false;
1395 }
1396
1397 #endif
1398
1399 bool haveSVM()
1400 {
1401 #ifdef HAVE_OPENCL_SVM
1402     return true;
1403 #else
1404     return false;
1405 #endif
1406 }
1407
1408 void finish()
1409 {
1410     Queue::getDefault().finish();
1411 }
1412
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1414
1415 struct Platform::Impl
1416 {
1417     Impl()
1418     {
1419         refcount = 1;
1420         handle = 0;
1421         initialized = false;
1422     }
1423
1424     ~Impl() {}
1425
1426     void init()
1427     {
1428         if( !initialized )
1429         {
1430             //cl_uint num_entries
1431             cl_uint n = 0;
1432             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1433                 handle = 0;
1434             if( handle != 0 )
1435             {
1436                 char buf[1000];
1437                 size_t len = 0;
1438                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1439                 buf[len] = '\0';
1440                 vendor = String(buf);
1441             }
1442
1443             initialized = true;
1444         }
1445     }
1446
1447     IMPLEMENT_REFCOUNTABLE();
1448
1449     cl_platform_id handle;
1450     String vendor;
1451     bool initialized;
1452 };
1453
1454 Platform::Platform()
1455 {
1456     p = 0;
1457 }
1458
1459 Platform::~Platform()
1460 {
1461     if(p)
1462         p->release();
1463 }
1464
1465 Platform::Platform(const Platform& pl)
1466 {
1467     p = (Impl*)pl.p;
1468     if(p)
1469         p->addref();
1470 }
1471
1472 Platform& Platform::operator = (const Platform& pl)
1473 {
1474     Impl* newp = (Impl*)pl.p;
1475     if(newp)
1476         newp->addref();
1477     if(p)
1478         p->release();
1479     p = newp;
1480     return *this;
1481 }
1482
1483 void* Platform::ptr() const
1484 {
1485     return p ? p->handle : 0;
1486 }
1487
1488 Platform& Platform::getDefault()
1489 {
1490     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1491     static Platform p;
1492     if( !p.p )
1493     {
1494         p.p = new Impl;
1495         p.p->init();
1496     }
1497     return p;
1498 }
1499
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1501
1502 // deviceVersion has format
1503 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1504 // by specification
1505 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1508 {
1509     major = minor = 0;
1510     if (10 >= deviceVersion.length())
1511         return;
1512     const char *pstr = deviceVersion.c_str();
1513     if (0 != strncmp(pstr, "OpenCL ", 7))
1514         return;
1515     size_t ppos = deviceVersion.find('.', 7);
1516     if (String::npos == ppos)
1517         return;
1518     String temp = deviceVersion.substr(7, ppos - 7);
1519     major = atoi(temp.c_str());
1520     temp = deviceVersion.substr(ppos + 1);
1521     minor = atoi(temp.c_str());
1522 }
1523
1524 struct Device::Impl
1525 {
1526     Impl(void* d)
1527         : refcount(1)
1528         , handle(0)
1529     {
1530         try
1531         {
1532             cl_device_id device = (cl_device_id)d;
1533             _init(device);
1534             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1535         }
1536         catch (...)
1537         {
1538             throw;
1539         }
1540     }
1541
1542     void _init(cl_device_id d)
1543     {
1544         handle = (cl_device_id)d;
1545
1546         name_ = getStrProp(CL_DEVICE_NAME);
1547         version_ = getStrProp(CL_DEVICE_VERSION);
1548         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1549         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1550         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1551         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1552         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1553         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1554         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1555         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1556
1557         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1558         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1559
1560         size_t pos = 0;
1561         while (pos < extensions_.size())
1562         {
1563             size_t pos2 = extensions_.find(' ', pos);
1564             if (pos2 == String::npos)
1565                 pos2 = extensions_.size();
1566             if (pos2 > pos)
1567             {
1568                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1569                 extensions_set_.insert(extensionName);
1570             }
1571             pos = pos2 + 1;
1572         }
1573
1574         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1575
1576         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1577         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1578             vendorName_ == "AMD")
1579             vendorID_ = VENDOR_AMD;
1580         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1581             vendorID_ = VENDOR_INTEL;
1582         else if (vendorName_ == "NVIDIA Corporation")
1583             vendorID_ = VENDOR_NVIDIA;
1584         else
1585             vendorID_ = UNKNOWN_VENDOR;
1586
1587         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1588         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1589         {
1590             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1591             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1592                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1593             maxWorkGroupSize_ = new_maxWorkGroupSize;
1594         }
1595 #if 0
1596         if (isExtensionSupported("cl_khr_spir"))
1597         {
1598 #ifndef CL_DEVICE_SPIR_VERSIONS
1599 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1600 #endif
1601             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1602             std::cout << spir_versions << std::endl;
1603         }
1604 #endif
1605     }
1606
1607     ~Impl()
1608     {
1609 #ifdef _WIN32
1610         if (!cv::__termination)
1611 #endif
1612         {
1613             if (handle)
1614             {
1615                 CV_OCL_CHECK(clReleaseDevice(handle));
1616                 handle = 0;
1617             }
1618         }
1619     }
1620
1621     template<typename _TpCL, typename _TpOut>
1622     _TpOut getProp(cl_device_info prop) const
1623     {
1624         _TpCL temp=_TpCL();
1625         size_t sz = 0;
1626
1627         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1628             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1629     }
1630
1631     bool getBoolProp(cl_device_info prop) const
1632     {
1633         cl_bool temp = CL_FALSE;
1634         size_t sz = 0;
1635
1636         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1637             sz == sizeof(temp) ? temp != 0 : false;
1638     }
1639
1640     String getStrProp(cl_device_info prop) const
1641     {
1642         char buf[4096];
1643         size_t sz=0;
1644         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1645             sz < sizeof(buf) ? String(buf) : String();
1646     }
1647
1648     bool isExtensionSupported(const std::string& extensionName) const
1649     {
1650         return extensions_set_.count(extensionName) > 0;
1651     }
1652
1653
1654     IMPLEMENT_REFCOUNTABLE();
1655
1656     cl_device_id handle;
1657
1658     String name_;
1659     String version_;
1660     std::string extensions_;
1661     int doubleFPConfig_;
1662     bool hostUnifiedMemory_;
1663     int maxComputeUnits_;
1664     size_t maxWorkGroupSize_;
1665     int type_;
1666     int addressBits_;
1667     int deviceVersionMajor_;
1668     int deviceVersionMinor_;
1669     String driverVersion_;
1670     String vendorName_;
1671     int vendorID_;
1672     bool intelSubgroupsSupport_;
1673
1674     std::set<std::string> extensions_set_;
1675 };
1676
1677
1678 Device::Device()
1679 {
1680     p = 0;
1681 }
1682
1683 Device::Device(void* d)
1684 {
1685     p = 0;
1686     set(d);
1687 }
1688
1689 Device::Device(const Device& d)
1690 {
1691     p = d.p;
1692     if(p)
1693         p->addref();
1694 }
1695
1696 Device& Device::operator = (const Device& d)
1697 {
1698     Impl* newp = (Impl*)d.p;
1699     if(newp)
1700         newp->addref();
1701     if(p)
1702         p->release();
1703     p = newp;
1704     return *this;
1705 }
1706
1707 Device::~Device()
1708 {
1709     if(p)
1710         p->release();
1711 }
1712
1713 void Device::set(void* d)
1714 {
1715     if(p)
1716         p->release();
1717     p = new Impl(d);
1718     if (p->handle)
1719     {
1720         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1721     }
1722 }
1723
1724 Device Device::fromHandle(void* d)
1725 {
1726     Device device(d);
1727     return device;
1728 }
1729
1730 void* Device::ptr() const
1731 {
1732     return p ? p->handle : 0;
1733 }
1734
1735 String Device::name() const
1736 { return p ? p->name_ : String(); }
1737
1738 String Device::extensions() const
1739 { return p ? String(p->extensions_) : String(); }
1740
1741 bool Device::isExtensionSupported(const String& extensionName) const
1742 { return p ? p->isExtensionSupported(extensionName) : false; }
1743
1744 String Device::version() const
1745 { return p ? p->version_ : String(); }
1746
1747 String Device::vendorName() const
1748 { return p ? p->vendorName_ : String(); }
1749
1750 int Device::vendorID() const
1751 { return p ? p->vendorID_ : 0; }
1752
1753 String Device::OpenCL_C_Version() const
1754 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1755
1756 String Device::OpenCLVersion() const
1757 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1758
1759 int Device::deviceVersionMajor() const
1760 { return p ? p->deviceVersionMajor_ : 0; }
1761
1762 int Device::deviceVersionMinor() const
1763 { return p ? p->deviceVersionMinor_ : 0; }
1764
1765 String Device::driverVersion() const
1766 { return p ? p->driverVersion_ : String(); }
1767
1768 int Device::type() const
1769 { return p ? p->type_ : 0; }
1770
1771 int Device::addressBits() const
1772 { return p ? p->addressBits_ : 0; }
1773
1774 bool Device::available() const
1775 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1776
1777 bool Device::compilerAvailable() const
1778 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1779
1780 bool Device::linkerAvailable() const
1781 #ifdef CL_VERSION_1_2
1782 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1783 #else
1784 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1785 #endif
1786
1787 int Device::doubleFPConfig() const
1788 { return p ? p->doubleFPConfig_ : 0; }
1789
1790 int Device::singleFPConfig() const
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1792
1793 int Device::halfFPConfig() const
1794 #ifdef CL_VERSION_1_2
1795 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1796 #else
1797 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1798 #endif
1799
1800 bool Device::endianLittle() const
1801 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1802
1803 bool Device::errorCorrectionSupport() const
1804 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1805
1806 int Device::executionCapabilities() const
1807 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1808
1809 size_t Device::globalMemCacheSize() const
1810 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1811
1812 int Device::globalMemCacheType() const
1813 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1814
1815 int Device::globalMemCacheLineSize() const
1816 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1817
1818 size_t Device::globalMemSize() const
1819 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1820
1821 size_t Device::localMemSize() const
1822 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1823
1824 int Device::localMemType() const
1825 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1826
1827 bool Device::hostUnifiedMemory() const
1828 { return p ? p->hostUnifiedMemory_ : false; }
1829
1830 bool Device::imageSupport() const
1831 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1832
1833 bool Device::imageFromBufferSupport() const
1834 {
1835     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1836 }
1837
1838 uint Device::imagePitchAlignment() const
1839 {
1840 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1841     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1842 #else
1843     return 0;
1844 #endif
1845 }
1846
1847 uint Device::imageBaseAddressAlignment() const
1848 {
1849 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1850     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1851 #else
1852     return 0;
1853 #endif
1854 }
1855
1856 size_t Device::image2DMaxWidth() const
1857 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1858
1859 size_t Device::image2DMaxHeight() const
1860 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1861
1862 size_t Device::image3DMaxWidth() const
1863 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1864
1865 size_t Device::image3DMaxHeight() const
1866 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1867
1868 size_t Device::image3DMaxDepth() const
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1870
1871 size_t Device::imageMaxBufferSize() const
1872 #ifdef CL_VERSION_1_2
1873 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1874 #else
1875 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1876 #endif
1877
1878 size_t Device::imageMaxArraySize() const
1879 #ifdef CL_VERSION_1_2
1880 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1881 #else
1882 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1883 #endif
1884
1885 bool Device::intelSubgroupsSupport() const
1886 { return p ? p->intelSubgroupsSupport_ : false; }
1887
1888 int Device::maxClockFrequency() const
1889 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1890
1891 int Device::maxComputeUnits() const
1892 { return p ? p->maxComputeUnits_ : 0; }
1893
1894 int Device::maxConstantArgs() const
1895 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1896
1897 size_t Device::maxConstantBufferSize() const
1898 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1899
1900 size_t Device::maxMemAllocSize() const
1901 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1902
1903 size_t Device::maxParameterSize() const
1904 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1905
1906 int Device::maxReadImageArgs() const
1907 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1908
1909 int Device::maxWriteImageArgs() const
1910 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1911
1912 int Device::maxSamplers() const
1913 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1914
1915 size_t Device::maxWorkGroupSize() const
1916 { return p ? p->maxWorkGroupSize_ : 0; }
1917
1918 int Device::maxWorkItemDims() const
1919 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1920
1921 void Device::maxWorkItemSizes(size_t* sizes) const
1922 {
1923     if(p)
1924     {
1925         const int MAX_DIMS = 32;
1926         size_t retsz = 0;
1927         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1928                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1929     }
1930 }
1931
1932 int Device::memBaseAddrAlign() const
1933 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1934
1935 int Device::nativeVectorWidthChar() const
1936 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1937
1938 int Device::nativeVectorWidthShort() const
1939 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1940
1941 int Device::nativeVectorWidthInt() const
1942 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1943
1944 int Device::nativeVectorWidthLong() const
1945 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1946
1947 int Device::nativeVectorWidthFloat() const
1948 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1949
1950 int Device::nativeVectorWidthDouble() const
1951 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1952
1953 int Device::nativeVectorWidthHalf() const
1954 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1955
1956 int Device::preferredVectorWidthChar() const
1957 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1958
1959 int Device::preferredVectorWidthShort() const
1960 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1961
1962 int Device::preferredVectorWidthInt() const
1963 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1964
1965 int Device::preferredVectorWidthLong() const
1966 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1967
1968 int Device::preferredVectorWidthFloat() const
1969 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1970
1971 int Device::preferredVectorWidthDouble() const
1972 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1973
1974 int Device::preferredVectorWidthHalf() const
1975 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1976
1977 size_t Device::printfBufferSize() const
1978 #ifdef CL_VERSION_1_2
1979 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1980 #else
1981 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1982 #endif
1983
1984
1985 size_t Device::profilingTimerResolution() const
1986 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1987
1988 const Device& Device::getDefault()
1989 {
1990     auto& c = OpenCLExecutionContext::getCurrent();
1991     if (!c.empty())
1992     {
1993         return c.getDevice();
1994     }
1995
1996     static Device dummy;
1997     return dummy;
1998 }
1999
2000 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2001
2002 template <typename Functor, typename ObjectType>
2003 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2004 {
2005     ::size_t required;
2006     cl_int err = f(obj, name, 0, NULL, &required);
2007     if (err != CL_SUCCESS)
2008         return err;
2009
2010     param.clear();
2011     if (required > 0)
2012     {
2013         AutoBuffer<char> buf(required + 1);
2014         char* ptr = buf.data(); // cleanup is not needed
2015         err = f(obj, name, required, ptr, NULL);
2016         if (err != CL_SUCCESS)
2017             return err;
2018         param = ptr;
2019     }
2020
2021     return CL_SUCCESS;
2022 }
2023
2024 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2025 {
2026     elems.clear();
2027     if (s.size() == 0)
2028         return;
2029     std::istringstream ss(s);
2030     std::string item;
2031     while (!ss.eof())
2032     {
2033         std::getline(ss, item, delim);
2034         elems.push_back(item);
2035     }
2036 }
2037
2038 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2039 // Sample: AMD:GPU:
2040 // Sample: AMD:GPU:Tahiti
2041 // Sample: :GPU|CPU: = '' = ':' = '::'
2042 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2043         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2044 {
2045     std::vector<std::string> parts;
2046     split(configurationStr, ':', parts);
2047     if (parts.size() > 3)
2048     {
2049         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2050         return false;
2051     }
2052     if (parts.size() > 2)
2053         deviceNameOrID = parts[2];
2054     if (parts.size() > 1)
2055     {
2056         split(parts[1], '|', deviceTypes);
2057     }
2058     if (parts.size() > 0)
2059     {
2060         platform = parts[0];
2061     }
2062     return true;
2063 }
2064
2065 #if defined WINRT || defined _WIN32_WCE
2066 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2067 {
2068     CV_UNUSED(configuration)
2069     return NULL;
2070 }
2071 #else
2072 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2073 {
2074     std::string platform, deviceName;
2075     std::vector<std::string> deviceTypes;
2076
2077     if (!configuration)
2078         configuration = getenv("OPENCV_OPENCL_DEVICE");
2079
2080     if (configuration &&
2081             (strcmp(configuration, "disabled") == 0 ||
2082              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2083             ))
2084         return NULL;
2085
2086     bool isID = false;
2087     int deviceID = -1;
2088     if (deviceName.length() == 1)
2089     // We limit ID range to 0..9, because we want to write:
2090     // - '2500' to mean i5-2500
2091     // - '8350' to mean AMD FX-8350
2092     // - '650' to mean GeForce 650
2093     // To extend ID range change condition to '> 0'
2094     {
2095         isID = true;
2096         for (size_t i = 0; i < deviceName.length(); i++)
2097         {
2098             if (!isdigit(deviceName[i]))
2099             {
2100                 isID = false;
2101                 break;
2102             }
2103         }
2104         if (isID)
2105         {
2106             deviceID = atoi(deviceName.c_str());
2107             if (deviceID < 0)
2108                 return NULL;
2109         }
2110     }
2111
2112     std::vector<cl_platform_id> platforms;
2113     {
2114         cl_uint numPlatforms = 0;
2115         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2116
2117         if (numPlatforms == 0)
2118             return NULL;
2119         platforms.resize((size_t)numPlatforms);
2120         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2121         platforms.resize(numPlatforms);
2122     }
2123
2124     int selectedPlatform = -1;
2125     if (platform.length() > 0)
2126     {
2127         for (size_t i = 0; i < platforms.size(); i++)
2128         {
2129             std::string name;
2130             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2131             if (name.find(platform) != std::string::npos)
2132             {
2133                 selectedPlatform = (int)i;
2134                 break;
2135             }
2136         }
2137         if (selectedPlatform == -1)
2138         {
2139             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2140             goto not_found;
2141         }
2142     }
2143     if (deviceTypes.size() == 0)
2144     {
2145         if (!isID)
2146         {
2147             deviceTypes.push_back("GPU");
2148             if (configuration)
2149                 deviceTypes.push_back("CPU");
2150         }
2151         else
2152             deviceTypes.push_back("ALL");
2153     }
2154     for (size_t t = 0; t < deviceTypes.size(); t++)
2155     {
2156         int deviceType = 0;
2157         std::string tempStrDeviceType = deviceTypes[t];
2158         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2159
2160         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2161             deviceType = Device::TYPE_GPU;
2162         else if (tempStrDeviceType == "cpu")
2163             deviceType = Device::TYPE_CPU;
2164         else if (tempStrDeviceType == "accelerator")
2165             deviceType = Device::TYPE_ACCELERATOR;
2166         else if (tempStrDeviceType == "all")
2167             deviceType = Device::TYPE_ALL;
2168         else
2169         {
2170             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2171             goto not_found;
2172         }
2173
2174         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2175         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2176                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2177                 i++)
2178         {
2179             cl_uint count = 0;
2180             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2181             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2182             {
2183                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2184             }
2185             if (count == 0)
2186                 continue;
2187             size_t base = devices.size();
2188             devices.resize(base + count);
2189             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2190             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2191             {
2192                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2193             }
2194         }
2195
2196         for (size_t i = (isID ? deviceID : 0);
2197              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2198              i++)
2199         {
2200             std::string name;
2201             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2202             cl_bool useGPU = true;
2203             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2204             {
2205                 cl_bool isIGPU = CL_FALSE;
2206                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2207                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2208             }
2209             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2210             {
2211                 // TODO check for OpenCL 1.1
2212                 return devices[i];
2213             }
2214         }
2215     }
2216
2217 not_found:
2218     if (!configuration)
2219         return NULL; // suppress messages on stderr
2220
2221     std::ostringstream msg;
2222     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2223         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2224         << "    Device types:";
2225     for (size_t t = 0; t < deviceTypes.size(); t++)
2226         msg << ' ' << deviceTypes[t];
2227
2228     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2229
2230     CV_LOG_ERROR(NULL, msg.str());
2231     return NULL;
2232 }
2233 #endif
2234
2235 #ifdef HAVE_OPENCL_SVM
2236 namespace svm {
2237
2238 enum AllocatorFlags { // don't use first 16 bits
2239         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2240         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2241         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2242         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2243         OPENCL_SVM_BUFFER_MAP = 4 << 16
2244 };
2245
2246 static bool checkForceSVMUmatUsage()
2247 {
2248     static bool initialized = false;
2249     static bool force = false;
2250     if (!initialized)
2251     {
2252         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2253         initialized = true;
2254     }
2255     return force;
2256 }
2257 static bool checkDisableSVMUMatUsage()
2258 {
2259     static bool initialized = false;
2260     static bool force = false;
2261     if (!initialized)
2262     {
2263         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2264         initialized = true;
2265     }
2266     return force;
2267 }
2268 static bool checkDisableSVM()
2269 {
2270     static bool initialized = false;
2271     static bool force = false;
2272     if (!initialized)
2273     {
2274         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2275         initialized = true;
2276     }
2277     return force;
2278 }
2279 // see SVMCapabilities
2280 static unsigned int getSVMCapabilitiesMask()
2281 {
2282     static bool initialized = false;
2283     static unsigned int mask = 0;
2284     if (!initialized)
2285     {
2286         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2287         if (envValue == NULL)
2288         {
2289             return ~0U; // all bits 1
2290         }
2291         mask = atoi(envValue);
2292         initialized = true;
2293     }
2294     return mask;
2295 }
2296 } // namespace
2297 #endif
2298
2299 static size_t getProgramCountLimit()
2300 {
2301     static bool initialized = false;
2302     static size_t count = 0;
2303     if (!initialized)
2304     {
2305         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2306         initialized = true;
2307     }
2308     return count;
2309 }
2310
2311 static int g_contextId = 0;
2312
2313 class OpenCLBufferPoolImpl;
2314 class OpenCLSVMBufferPoolImpl;
2315
2316 struct Context::Impl
2317 {
2318     static Context::Impl* get(Context& context) { return context.p; }
2319
2320     typedef std::deque<Context::Impl*> container_t;
2321     static container_t& getGlobalContainer()
2322     {
2323         // never delete this container (Impl lifetime is greater due to TLS storage)
2324         static container_t* g_contexts = new container_t();
2325         return *g_contexts;
2326     }
2327
2328 protected:
2329     Impl(const std::string& configuration_)
2330         : refcount(1)
2331         , contextId(CV_XADD(&g_contextId, 1))
2332         , configuration(configuration_)
2333         , handle(0)
2334 #ifdef HAVE_DIRECTX
2335         , p_directx_impl(0)
2336 #endif
2337 #ifdef HAVE_OPENCL_SVM
2338         , svmInitialized(false)
2339 #endif
2340     {
2341         if (!haveOpenCL())
2342             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2343
2344         cv::AutoLock lock(cv::getInitializationMutex());
2345         auto& container = getGlobalContainer();
2346         container.resize(std::max(container.size(), (size_t)contextId + 1));
2347         container[contextId] = this;
2348     }
2349
2350     ~Impl()
2351     {
2352 #ifdef _WIN32
2353         if (!cv::__termination)
2354 #endif
2355         {
2356             if (handle)
2357             {
2358                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2359                 handle = NULL;
2360             }
2361             devices.clear();
2362 #ifdef HAVE_DIRECTX
2363             directx::internal::deleteDirectXImpl(&p_directx_impl);
2364 #endif
2365         }
2366
2367         {
2368             cv::AutoLock lock(cv::getInitializationMutex());
2369             auto& container = getGlobalContainer();
2370             CV_CheckLT((size_t)contextId, container.size(), "");
2371             container[contextId] = NULL;
2372         }
2373     }
2374
2375     void init_device_list()
2376     {
2377         CV_Assert(handle);
2378
2379         cl_uint ndevices = 0;
2380         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2381         CV_Assert(ndevices > 0);
2382
2383         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2384         size_t devices_ret_size = 0;
2385         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2386         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2387
2388         devices.clear();
2389         for (unsigned i = 0; i < ndevices; i++)
2390         {
2391             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2392         }
2393     }
2394
2395     void __init_buffer_pools();  // w/o synchronization
2396     void _init_buffer_pools() const
2397     {
2398         if (!bufferPool_)
2399         {
2400             cv::AutoLock lock(cv::getInitializationMutex());
2401             if (!bufferPool_)
2402             {
2403                 const_cast<Impl*>(this)->__init_buffer_pools();
2404             }
2405         }
2406     }
2407 public:
2408     static Impl* findContext(const std::string& configuration)
2409     {
2410         CV_TRACE_FUNCTION();
2411         cv::AutoLock lock(cv::getInitializationMutex());
2412         auto& container = getGlobalContainer();
2413         if (configuration.empty() && !container.empty())
2414             return container[0];
2415         for (auto it = container.begin(); it != container.end(); ++it)
2416         {
2417             Impl* i = *it;
2418             if (i && i->configuration == configuration)
2419             {
2420                 return i;
2421             }
2422         }
2423         return NULL;
2424     }
2425
2426     static Impl* findOrCreateContext(const std::string& configuration_)
2427     {
2428         CV_TRACE_FUNCTION();
2429         std::string configuration = configuration_;
2430         if (configuration_.empty())
2431         {
2432             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2433             if (c)
2434                 configuration = c;
2435         }
2436         Impl* impl = findContext(configuration);
2437         if (impl)
2438         {
2439             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2440             impl->addref();
2441             return impl;
2442         }
2443
2444         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2445         if (d == NULL)
2446             return NULL;
2447
2448         impl = new Impl(configuration);
2449         try
2450         {
2451             impl->createFromDevice(d);
2452             if (impl->handle)
2453                 return impl;
2454             delete impl;
2455             return NULL;
2456         }
2457         catch (...)
2458         {
2459             delete impl;
2460             throw;
2461         }
2462     }
2463
2464     static Impl* findOrCreateContext(cl_context h)
2465     {
2466         CV_TRACE_FUNCTION();
2467
2468         CV_Assert(h);
2469
2470         std::string configuration = cv::format("@ctx-%p", (void*)h);
2471         Impl* impl = findContext(configuration);
2472         if (impl)
2473         {
2474             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2475             impl->addref();
2476             return impl;
2477         }
2478
2479         impl = new Impl(configuration);
2480         try
2481         {
2482             CV_OCL_CHECK(clRetainContext(h));
2483             impl->handle = h;
2484             impl->init_device_list();
2485             return impl;
2486         }
2487         catch (...)
2488         {
2489             delete impl;
2490             throw;
2491         }
2492     }
2493
2494     static Impl* findOrCreateContext(const ocl::Device& device)
2495     {
2496         CV_TRACE_FUNCTION();
2497
2498         CV_Assert(!device.empty());
2499         cl_device_id d = (cl_device_id)device.ptr();
2500         CV_Assert(d);
2501
2502         std::string configuration = cv::format("@dev-%p", (void*)d);
2503         Impl* impl = findContext(configuration);
2504         if (impl)
2505         {
2506             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2507             impl->addref();
2508             return impl;
2509         }
2510
2511         impl = new Impl(configuration);
2512         try
2513         {
2514             impl->createFromDevice(d);
2515             CV_Assert(impl->handle);
2516             return impl;
2517         }
2518         catch (...)
2519         {
2520             delete impl;
2521             throw;
2522         }
2523     }
2524
2525     void setDefault()
2526     {
2527         CV_TRACE_FUNCTION();
2528         cl_device_id d = selectOpenCLDevice();
2529
2530         if (d == NULL)
2531             return;
2532
2533         createFromDevice(d);
2534     }
2535
2536     void createFromDevice(cl_device_id d)
2537     {
2538         CV_TRACE_FUNCTION();
2539         CV_Assert(handle == NULL);
2540
2541         cl_platform_id pl = NULL;
2542         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2543
2544         cl_context_properties prop[] =
2545         {
2546             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2547             0
2548         };
2549
2550         // !!! in the current implementation force the number of devices to 1 !!!
2551         cl_uint nd = 1;
2552         cl_int status;
2553
2554         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2555         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2556
2557         bool ok = handle != 0 && status == CL_SUCCESS;
2558         if( ok )
2559         {
2560             devices.resize(nd);
2561             devices[0].set(d);
2562         }
2563         else
2564             handle = NULL;
2565     }
2566
2567     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2568
2569     void unloadProg(Program& prog)
2570     {
2571         cv::AutoLock lock(program_cache_mutex);
2572         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2573         {
2574               phash_t::iterator it = phash.find(*i);
2575               if (it != phash.end())
2576               {
2577                   if (it->second.ptr() == prog.ptr())
2578                   {
2579                       phash.erase(*i);
2580                       cacheList.erase(i);
2581                       return;
2582                   }
2583               }
2584         }
2585     }
2586
2587     std::string& getPrefixString()
2588     {
2589         if (prefix.empty())
2590         {
2591             cv::AutoLock lock(program_cache_mutex);
2592             if (prefix.empty())
2593             {
2594                 CV_Assert(!devices.empty());
2595                 const Device& d = devices[0];
2596                 int bits = d.addressBits();
2597                 if (bits > 0 && bits != 64)
2598                     prefix = cv::format("%d-bit--", bits);
2599                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2600                 // sanitize chars
2601                 for (size_t i = 0; i < prefix.size(); i++)
2602                 {
2603                     char c = prefix[i];
2604                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2605                     {
2606                         prefix[i] = '_';
2607                     }
2608                 }
2609             }
2610         }
2611         return prefix;
2612     }
2613
2614     std::string& getPrefixBase()
2615     {
2616         if (prefix_base.empty())
2617         {
2618             cv::AutoLock lock(program_cache_mutex);
2619             if (prefix_base.empty())
2620             {
2621                 const Device& d = devices[0];
2622                 int bits = d.addressBits();
2623                 if (bits > 0 && bits != 64)
2624                     prefix_base = cv::format("%d-bit--", bits);
2625                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2626                 // sanitize chars
2627                 for (size_t i = 0; i < prefix_base.size(); i++)
2628                 {
2629                     char c = prefix_base[i];
2630                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2631                     {
2632                         prefix_base[i] = '_';
2633                     }
2634                 }
2635             }
2636         }
2637         return prefix_base;
2638     }
2639
2640     IMPLEMENT_REFCOUNTABLE();
2641
2642     const int contextId;  // global unique ID
2643     const std::string configuration;
2644
2645     cl_context handle;
2646     std::vector<Device> devices;
2647
2648     std::string prefix;
2649     std::string prefix_base;
2650
2651     cv::Mutex program_cache_mutex;
2652     typedef std::map<std::string, Program> phash_t;
2653     phash_t phash;
2654     typedef std::list<cv::String> CacheList;
2655     CacheList cacheList;
2656
2657     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2658     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2659     OpenCLBufferPoolImpl& getBufferPool() const
2660     {
2661         _init_buffer_pools();
2662         CV_DbgAssert(bufferPool_);
2663         return *bufferPool_.get();
2664     }
2665     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2666     {
2667         _init_buffer_pools();
2668         CV_DbgAssert(bufferPoolHostPtr_);
2669         return *bufferPoolHostPtr_.get();
2670     }
2671
2672 #ifdef HAVE_DIRECTX
2673     directx::internal::OpenCLDirectXImpl* p_directx_impl;
2674
2675     directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2676     {
2677         if (!p_directx_impl)
2678         {
2679             p_directx_impl = directx::internal::createDirectXImpl();
2680         }
2681         return p_directx_impl;
2682     }
2683 #endif
2684
2685 #ifdef HAVE_OPENCL_SVM
2686     bool svmInitialized;
2687     bool svmAvailable;
2688     bool svmEnabled;
2689     svm::SVMCapabilities svmCapabilities;
2690     svm::SVMFunctions svmFunctions;
2691
2692     void svmInit()
2693     {
2694         CV_Assert(handle != NULL);
2695         const Device& device = devices[0];
2696         cl_device_svm_capabilities deviceCaps = 0;
2697         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2698         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2699         if (status != CL_SUCCESS)
2700         {
2701             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2702             goto noSVM;
2703         }
2704         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2705         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2706         svmCapabilities.value_ =
2707                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2708                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2709                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2710                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2711         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2712         if (svmCapabilities.value_ == 0)
2713         {
2714             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2715             goto noSVM;
2716         }
2717         try
2718         {
2719             // Try OpenCL 2.0
2720             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2721             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2722             if (!ptr)
2723             {
2724                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2725                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2726             }
2727             try
2728             {
2729                 bool error = false;
2730                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2731                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2732                 {
2733                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2734                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2735                 }
2736                 clFinish(q);
2737                 try
2738                 {
2739                     ((int*)ptr)[0] = 100;
2740                 }
2741                 catch (...)
2742                 {
2743                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2744                     error = true;
2745                 }
2746                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2747                 {
2748                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2749                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2750                 }
2751                 clFinish(q);
2752                 if (error)
2753                 {
2754                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2755                 }
2756             }
2757             catch (...)
2758             {
2759                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2760                 clSVMFree(handle, ptr);
2761                 throw;
2762             }
2763             clSVMFree(handle, ptr);
2764             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2765             svmFunctions.fn_clSVMFree = clSVMFree;
2766             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2767             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2768             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2769             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2770             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2771             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2772             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2773         }
2774         catch (...)
2775         {
2776             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2777             try
2778             {
2779                 // Try HSA extension
2780                 String extensions = device.extensions();
2781                 if (extensions.find("cl_amd_svm") == String::npos)
2782                 {
2783                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2784                     goto noSVM;
2785                 }
2786                 cl_platform_id p = NULL;
2787                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2788                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2789                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2790                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2791                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2792                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2793                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2794                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2795                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2796                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2797                 CV_Assert(svmFunctions.isValid());
2798             }
2799             catch (...)
2800             {
2801                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2802                 goto noSVM;
2803             }
2804         }
2805
2806         svmAvailable = true;
2807         svmEnabled = !svm::checkDisableSVM();
2808         svmInitialized = true;
2809         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2810         return;
2811     noSVM:
2812         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2813         svmAvailable = false;
2814         svmEnabled = false;
2815         svmCapabilities.value_ = 0;
2816         svmInitialized = true;
2817         svmFunctions.fn_clSVMAlloc = NULL;
2818         return;
2819     }
2820
2821     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2822
2823     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2824     {
2825         _init_buffer_pools();
2826         CV_DbgAssert(bufferPoolSVM_);
2827         return *bufferPoolSVM_.get();
2828     }
2829 #endif
2830
2831     friend class Program;
2832 };
2833
2834
2835 Context::Context()
2836 {
2837     p = 0;
2838 }
2839
2840 Context::~Context()
2841 {
2842     release();
2843 }
2844
2845 // deprecated
2846 Context::Context(int dtype)
2847 {
2848     p = 0;
2849     create(dtype);
2850 }
2851
2852 void Context::release()
2853 {
2854     if (p)
2855     {
2856         p->release();
2857         p = NULL;
2858     }
2859 }
2860
2861 bool Context::create()
2862 {
2863     release();
2864     if (!haveOpenCL())
2865         return false;
2866     p = Impl::findOrCreateContext(std::string());
2867     if (p && p->handle)
2868         return true;
2869     release();
2870     return false;
2871 }
2872
2873 // deprecated
2874 bool Context::create(int dtype)
2875 {
2876     if( !haveOpenCL() )
2877         return false;
2878     release();
2879     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2880     {
2881         p = Impl::findOrCreateContext("");
2882     }
2883     else if (dtype == CL_DEVICE_TYPE_GPU)
2884     {
2885         p = Impl::findOrCreateContext(":GPU:");
2886     }
2887     else if (dtype == CL_DEVICE_TYPE_CPU)
2888     {
2889         p = Impl::findOrCreateContext(":CPU:");
2890     }
2891     else
2892     {
2893         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2894     }
2895     if (p && !p->handle)
2896     {
2897         release();
2898     }
2899     return p != 0;
2900 }
2901
2902 Context::Context(const Context& c)
2903 {
2904     p = (Impl*)c.p;
2905     if(p)
2906         p->addref();
2907 }
2908
2909 Context& Context::operator = (const Context& c)
2910 {
2911     Impl* newp = (Impl*)c.p;
2912     if(newp)
2913         newp->addref();
2914     if(p)
2915         p->release();
2916     p = newp;
2917     return *this;
2918 }
2919
2920 void* Context::ptr() const
2921 {
2922     return p == NULL ? NULL : p->handle;
2923 }
2924
2925 size_t Context::ndevices() const
2926 {
2927     return p ? p->devices.size() : 0;
2928 }
2929
2930 Device& Context::device(size_t idx) const
2931 {
2932     static Device dummy;
2933     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2934 }
2935
2936 Context& Context::getDefault(bool initialize)
2937 {
2938     auto& c = OpenCLExecutionContext::getCurrent();
2939     if (!c.empty())
2940     {
2941         auto& ctx = c.getContext();
2942         return ctx;
2943     }
2944
2945     CV_UNUSED(initialize);
2946     static Context dummy;
2947     return dummy;
2948 }
2949
2950 Program Context::getProg(const ProgramSource& prog,
2951                          const String& buildopts, String& errmsg)
2952 {
2953     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2954 }
2955
2956 void Context::unloadProg(Program& prog)
2957 {
2958     if (p)
2959         p->unloadProg(prog);
2960 }
2961
2962 /* static */
2963 Context Context::fromHandle(void* context)
2964 {
2965     Context ctx;
2966     ctx.p = Impl::findOrCreateContext((cl_context)context);
2967     return ctx;
2968 }
2969
2970 /* static */
2971 Context Context::fromDevice(const ocl::Device& device)
2972 {
2973     Context ctx;
2974     ctx.p = Impl::findOrCreateContext(device);
2975     return ctx;
2976 }
2977
2978 /* static */
2979 Context Context::create(const std::string& configuration)
2980 {
2981     Context ctx;
2982     ctx.p = Impl::findOrCreateContext(configuration);
2983     return ctx;
2984 }
2985
2986 #ifdef HAVE_OPENCL_SVM
2987 bool Context::useSVM() const
2988 {
2989     Context::Impl* i = p;
2990     CV_Assert(i);
2991     if (!i->svmInitialized)
2992         i->svmInit();
2993     return i->svmEnabled;
2994 }
2995 void Context::setUseSVM(bool enabled)
2996 {
2997     Context::Impl* i = p;
2998     CV_Assert(i);
2999     if (!i->svmInitialized)
3000         i->svmInit();
3001     if (enabled && !i->svmAvailable)
3002     {
3003         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3004     }
3005     i->svmEnabled = enabled;
3006 }
3007 #else
3008 bool Context::useSVM() const { return false; }
3009 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3010 #endif
3011
3012 #ifdef HAVE_OPENCL_SVM
3013 namespace svm {
3014
3015 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3016 {
3017     Context::Impl* i = context.p;
3018     CV_Assert(i);
3019     if (!i->svmInitialized)
3020         i->svmInit();
3021     return i->svmCapabilities;
3022 }
3023
3024 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3025 {
3026     Context::Impl* i = context.p;
3027     CV_Assert(i);
3028     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3029     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3030     return &i->svmFunctions;
3031 }
3032
3033 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3034 {
3035     if (checkForceSVMUmatUsage())
3036         return true;
3037     if (checkDisableSVMUMatUsage())
3038         return false;
3039     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3040         return true;
3041     return false; // don't use SVM by default
3042 }
3043
3044 } // namespace cv::ocl::svm
3045 #endif // HAVE_OPENCL_SVM
3046
3047
3048 static void get_platform_name(cl_platform_id id, String& name)
3049 {
3050     // get platform name string length
3051     size_t sz = 0;
3052     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3053
3054     // get platform name string
3055     AutoBuffer<char> buf(sz + 1);
3056     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3057
3058     // just in case, ensure trailing zero for ASCIIZ string
3059     buf[sz] = 0;
3060
3061     name = buf.data();
3062 }
3063
3064 /*
3065 // Attaches OpenCL context to OpenCV
3066 */
3067 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3068 {
3069     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3070     ctx.bind();
3071 }
3072
3073 /* static */
3074 OpenCLExecutionContext OpenCLExecutionContext::create(
3075         const std::string& platformName, void* platformID, void* context, void* deviceID
3076 )
3077 {
3078     if (!haveOpenCL())
3079         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3080
3081     cl_uint cnt = 0;
3082     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3083
3084     if (cnt == 0)
3085         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3086
3087     std::vector<cl_platform_id> platforms(cnt);
3088
3089     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3090
3091     bool platformAvailable = false;
3092
3093     // check if external platformName contained in list of available platforms in OpenCV
3094     for (unsigned int i = 0; i < cnt; i++)
3095     {
3096         String availablePlatformName;
3097         get_platform_name(platforms[i], availablePlatformName);
3098         // external platform is found in the list of available platforms
3099         if (platformName == availablePlatformName)
3100         {
3101             platformAvailable = true;
3102             break;
3103         }
3104     }
3105
3106     if (!platformAvailable)
3107         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3108
3109     // check if platformID corresponds to platformName
3110     String actualPlatformName;
3111     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3112     if (platformName != actualPlatformName)
3113         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3114
3115     OpenCLExecutionContext ctx;
3116     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3117     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3118     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3119     return ctx;
3120 }
3121
3122 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3123 {
3124     // internal call, less checks
3125     cl_platform_id platformID = (cl_platform_id)_platform;
3126     cl_context context = (cl_context)_context;
3127     cl_device_id deviceID = (cl_device_id)_device;
3128
3129     std::string platformName = PlatformInfo(&platformID).name();
3130
3131     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3132     CV_Assert(!clExecCtx.empty());
3133     ctx = clExecCtx.getContext();
3134 }
3135
3136 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3137
3138 struct Queue::Impl
3139 {
3140     inline void __init()
3141     {
3142         refcount = 1;
3143         handle = 0;
3144         isProfilingQueue_ = false;
3145     }
3146
3147     Impl(cl_command_queue q)
3148     {
3149         __init();
3150         handle = q;
3151
3152         cl_command_queue_properties props = 0;
3153         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3154         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3155     }
3156
3157     Impl(cl_command_queue q, bool isProfilingQueue)
3158     {
3159         __init();
3160         handle = q;
3161         isProfilingQueue_ = isProfilingQueue;
3162     }
3163
3164     Impl(const Context& c, const Device& d, bool withProfiling = false)
3165     {
3166         __init();
3167
3168         const Context* pc = &c;
3169         cl_context ch = (cl_context)pc->ptr();
3170         if( !ch )
3171         {
3172             pc = &Context::getDefault();
3173             ch = (cl_context)pc->ptr();
3174         }
3175         cl_device_id dh = (cl_device_id)d.ptr();
3176         if( !dh )
3177             dh = (cl_device_id)pc->device(0).ptr();
3178         cl_int retval = 0;
3179         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3180         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3181         isProfilingQueue_ = withProfiling;
3182     }
3183
3184     ~Impl()
3185     {
3186 #ifdef _WIN32
3187         if (!cv::__termination)
3188 #endif
3189         {
3190             if(handle)
3191             {
3192                 CV_OCL_DBG_CHECK(clFinish(handle));
3193                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3194                 handle = NULL;
3195             }
3196         }
3197     }
3198
3199     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3200     {
3201         if (isProfilingQueue_)
3202             return self;
3203
3204         if (profiling_queue_.ptr())
3205             return profiling_queue_;
3206
3207         cl_context ctx = 0;
3208         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3209
3210         cl_device_id device = 0;
3211         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3212
3213         cl_int result = CL_SUCCESS;
3214         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3215         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3216         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3217
3218         Queue queue;
3219         queue.p = new Impl(q, true);
3220         profiling_queue_ = queue;
3221
3222         return profiling_queue_;
3223     }
3224
3225     IMPLEMENT_REFCOUNTABLE();
3226
3227     cl_command_queue handle;
3228     bool isProfilingQueue_;
3229     cv::ocl::Queue profiling_queue_;
3230 };
3231
3232 Queue::Queue()
3233 {
3234     p = 0;
3235 }
3236
3237 Queue::Queue(const Context& c, const Device& d)
3238 {
3239     p = 0;
3240     create(c, d);
3241 }
3242
3243 Queue::Queue(const Queue& q)
3244 {
3245     p = q.p;
3246     if(p)
3247         p->addref();
3248 }
3249
3250 Queue& Queue::operator = (const Queue& q)
3251 {
3252     Impl* newp = (Impl*)q.p;
3253     if(newp)
3254         newp->addref();
3255     if(p)
3256         p->release();
3257     p = newp;
3258     return *this;
3259 }
3260
3261 Queue::~Queue()
3262 {
3263     if(p)
3264         p->release();
3265 }
3266
3267 bool Queue::create(const Context& c, const Device& d)
3268 {
3269     if(p)
3270         p->release();
3271     p = new Impl(c, d);
3272     return p->handle != 0;
3273 }
3274
3275 void Queue::finish()
3276 {
3277     if(p && p->handle)
3278     {
3279         CV_OCL_DBG_CHECK(clFinish(p->handle));
3280     }
3281 }
3282
3283 const Queue& Queue::getProfilingQueue() const
3284 {
3285     CV_Assert(p);
3286     return p->getProfilingQueue(*this);
3287 }
3288
3289 void* Queue::ptr() const
3290 {
3291     return p ? p->handle : 0;
3292 }
3293
3294 Queue& Queue::getDefault()
3295 {
3296     auto& c = OpenCLExecutionContext::getCurrent();
3297     if (!c.empty())
3298     {
3299         auto& q = c.getQueue();
3300         return q;
3301     }
3302     static Queue dummy;
3303     return dummy;
3304 }
3305
3306 static cl_command_queue getQueue(const Queue& q)
3307 {
3308     cl_command_queue qq = (cl_command_queue)q.ptr();
3309     if(!qq)
3310         qq = (cl_command_queue)Queue::getDefault().ptr();
3311     return qq;
3312 }
3313
3314 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3315
3316 KernelArg::KernelArg()
3317     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3318 {
3319 }
3320
3321 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3322     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3323 {
3324     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3325 }
3326
3327 KernelArg KernelArg::Constant(const Mat& m)
3328 {
3329     CV_Assert(m.isContinuous());
3330     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3331 }
3332
3333 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3334
3335 struct Kernel::Impl
3336 {
3337     Impl(const char* kname, const Program& prog) :
3338         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3339     {
3340         cl_program ph = (cl_program)prog.ptr();
3341         cl_int retval = 0;
3342         name = kname;
3343         if (ph)
3344         {
3345             handle = clCreateKernel(ph, kname, &retval);
3346             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3347         }
3348         for( int i = 0; i < MAX_ARRS; i++ )
3349             u[i] = 0;
3350         haveTempDstUMats = false;
3351         haveTempSrcUMats = false;
3352     }
3353
3354     void cleanupUMats()
3355     {
3356         for( int i = 0; i < MAX_ARRS; i++ )
3357             if( u[i] )
3358             {
3359                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3360                 {
3361                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3362                     u[i]->currAllocator->deallocate(u[i]);
3363                 }
3364                 u[i] = 0;
3365             }
3366         nu = 0;
3367         haveTempDstUMats = false;
3368         haveTempSrcUMats = false;
3369     }
3370
3371     void addUMat(const UMat& m, bool dst)
3372     {
3373         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3374         u[nu] = m.u;
3375         CV_XADD(&m.u->urefcount, 1);
3376         nu++;
3377         if(dst && m.u->tempUMat())
3378             haveTempDstUMats = true;
3379         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3380             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3381     }
3382
3383     void addImage(const Image2D& image)
3384     {
3385         images.push_back(image);
3386     }
3387
3388     void finit(cl_event e)
3389     {
3390         CV_UNUSED(e);
3391         cleanupUMats();
3392         images.clear();
3393         isInProgress = false;
3394         release();
3395     }
3396
3397     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3398             bool sync, int64* timeNS, const Queue& q);
3399
3400     ~Impl()
3401     {
3402         if(handle)
3403         {
3404             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3405         }
3406     }
3407
3408     IMPLEMENT_REFCOUNTABLE();
3409
3410     cv::String name;
3411     cl_kernel handle;
3412     enum { MAX_ARRS = 16 };
3413     UMatData* u[MAX_ARRS];
3414     bool isInProgress;
3415     bool isAsyncRun;  // true if kernel was scheduled in async mode
3416     int nu;
3417     std::list<Image2D> images;
3418     bool haveTempDstUMats;
3419     bool haveTempSrcUMats;
3420 };
3421
3422 }} // namespace cv::ocl
3423
3424 extern "C" {
3425
3426 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3427 {
3428     try
3429     {
3430         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3431     }
3432     catch (const cv::Exception& exc)
3433     {
3434         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3435     }
3436     catch (const std::exception& exc)
3437     {
3438         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3439     }
3440     catch (...)
3441     {
3442         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3443     }
3444 }
3445
3446 }
3447
3448 namespace cv { namespace ocl {
3449
3450 Kernel::Kernel()
3451 {
3452     p = 0;
3453 }
3454
3455 Kernel::Kernel(const char* kname, const Program& prog)
3456 {
3457     p = 0;
3458     create(kname, prog);
3459 }
3460
3461 Kernel::Kernel(const char* kname, const ProgramSource& src,
3462                const String& buildopts, String* errmsg)
3463 {
3464     p = 0;
3465     create(kname, src, buildopts, errmsg);
3466 }
3467
3468 Kernel::Kernel(const Kernel& k)
3469 {
3470     p = k.p;
3471     if(p)
3472         p->addref();
3473 }
3474
3475 Kernel& Kernel::operator = (const Kernel& k)
3476 {
3477     Impl* newp = (Impl*)k.p;
3478     if(newp)
3479         newp->addref();
3480     if(p)
3481         p->release();
3482     p = newp;
3483     return *this;
3484 }
3485
3486 Kernel::~Kernel()
3487 {
3488     if(p)
3489         p->release();
3490 }
3491
3492 bool Kernel::create(const char* kname, const Program& prog)
3493 {
3494     if(p)
3495         p->release();
3496     p = new Impl(kname, prog);
3497     if(p->handle == 0)
3498     {
3499         p->release();
3500         p = 0;
3501     }
3502 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3503     CV_Assert(p);
3504 #endif
3505     return p != 0;
3506 }
3507
3508 bool Kernel::create(const char* kname, const ProgramSource& src,
3509                     const String& buildopts, String* errmsg)
3510 {
3511     if(p)
3512     {
3513         p->release();
3514         p = 0;
3515     }
3516     String tempmsg;
3517     if( !errmsg ) errmsg = &tempmsg;
3518     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3519     return create(kname, prog);
3520 }
3521
3522 void* Kernel::ptr() const
3523 {
3524     return p ? p->handle : 0;
3525 }
3526
3527 bool Kernel::empty() const
3528 {
3529     return ptr() == 0;
3530 }
3531
3532 int Kernel::set(int i, const void* value, size_t sz)
3533 {
3534     if (!p || !p->handle)
3535         return -1;
3536     if (i < 0)
3537         return i;
3538     if( i == 0 )
3539         p->cleanupUMats();
3540
3541     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3542     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3543     if (retval != CL_SUCCESS)
3544         return -1;
3545     return i+1;
3546 }
3547
3548 int Kernel::set(int i, const Image2D& image2D)
3549 {
3550     p->addImage(image2D);
3551     cl_mem h = (cl_mem)image2D.ptr();
3552     return set(i, &h, sizeof(h));
3553 }
3554
3555 int Kernel::set(int i, const UMat& m)
3556 {
3557     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3558 }
3559
3560 int Kernel::set(int i, const KernelArg& arg)
3561 {
3562     if( !p || !p->handle )
3563         return -1;
3564     if (i < 0)
3565     {
3566         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3567                 p->name.c_str(), (int)i));
3568         return i;
3569     }
3570     if( i == 0 )
3571         p->cleanupUMats();
3572     cl_int status = 0;
3573     if( arg.m )
3574     {
3575         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3576                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3577         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3578         if (ptronly && arg.m->empty())
3579         {
3580             cl_mem h_null = (cl_mem)NULL;
3581             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3582             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3583             return i + 1;
3584         }
3585         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3586
3587         if (!h)
3588         {
3589             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3590                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3591             p->release();
3592             p = 0;
3593             return -1;
3594         }
3595
3596 #ifdef HAVE_OPENCL_SVM
3597         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3598         {
3599             const Context& ctx = Context::getDefault();
3600             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3601             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3602             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3603 #if 1 // TODO
3604             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3605 #else
3606             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3607 #endif
3608             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3609         }
3610         else
3611 #endif
3612         {
3613             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3614             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3615         }
3616
3617         if (ptronly)
3618         {
3619             i++;
3620         }
3621         else if( arg.m->dims <= 2 )
3622         {
3623             UMat2D u2d(*arg.m);
3624             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3625             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3626             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3627             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3628             i += 3;
3629
3630             if( !(arg.flags & KernelArg::NO_SIZE) )
3631             {
3632                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3633                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3634                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3635                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3636                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3637                 i += 2;
3638             }
3639         }
3640         else
3641         {
3642             UMat3D u3d(*arg.m);
3643             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3644             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3645             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3646             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3647             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3648             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3649             i += 4;
3650             if( !(arg.flags & KernelArg::NO_SIZE) )
3651             {
3652                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3653                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3654                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3655                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3656                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3657                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3658                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3659                 i += 3;
3660             }
3661         }
3662         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3663         return i;
3664     }
3665     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3666     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3667     return i+1;
3668 }
3669
3670 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3671                  bool sync, const Queue& q)
3672 {
3673     if (!p)
3674         return false;
3675
3676     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3677     size_t total = 1;
3678     CV_Assert(_globalsize != NULL);
3679     for (int i = 0; i < dims; i++)
3680     {
3681         size_t val = _localsize ? _localsize[i] :
3682             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3683         CV_Assert( val > 0 );
3684         total *= _globalsize[i];
3685         if (_globalsize[i] == 1 && !_localsize)
3686             val = 1;
3687         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3688     }
3689     CV_Assert(total > 0);
3690
3691     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3692 }
3693
3694
3695 static bool isRaiseErrorOnReuseAsyncKernel()
3696 {
3697     static bool initialized = false;
3698     static bool value = false;
3699     if (!initialized)
3700     {
3701         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3702         initialized = true;
3703     }
3704     return value;
3705 }
3706
3707 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3708         bool sync, int64* timeNS, const Queue& q)
3709 {
3710     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3711
3712     if (!handle)
3713     {
3714         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3715         return false;
3716     }
3717
3718     if (isAsyncRun)
3719     {
3720         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3721         if (isRaiseErrorOnReuseAsyncKernel())
3722             CV_Assert(0);
3723         return false;  // OpenCV 5.0: raise error
3724     }
3725     isAsyncRun = !sync;
3726
3727     if (isInProgress)
3728     {
3729         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3730         if (isRaiseErrorOnReuseAsyncKernel())
3731             CV_Assert(0);
3732         return false;  // OpenCV 5.0: raise error
3733     }
3734
3735     cl_command_queue qq = getQueue(q);
3736     if (haveTempDstUMats)
3737         sync = true;
3738     if (haveTempSrcUMats)
3739         sync = true;
3740     if (timeNS)
3741         sync = true;
3742     cl_event asyncEvent = 0;
3743     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3744                                            NULL, globalsize, localsize, 0, 0,
3745                                            (sync && !timeNS) ? 0 : &asyncEvent);
3746 #if !CV_OPENCL_SHOW_RUN_KERNELS
3747     if (retval != CL_SUCCESS)
3748 #endif
3749     {
3750         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3751                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3752                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3753                         sync ? "true" : "false"
3754                         );
3755         if (retval != CL_SUCCESS)
3756         {
3757             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3758         }
3759 #if CV_OPENCL_TRACE_CHECK
3760         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3761 #else
3762         printf("%s\n", msg.c_str());
3763         fflush(stdout);
3764 #endif
3765     }
3766     if (sync || retval != CL_SUCCESS)
3767     {
3768         CV_OCL_DBG_CHECK(clFinish(qq));
3769         if (timeNS)
3770         {
3771             if (retval == CL_SUCCESS)
3772             {
3773                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3774                 cl_ulong startTime, stopTime;
3775                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3776                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3777                 *timeNS = (int64)(stopTime - startTime);
3778             }
3779             else
3780             {
3781                 *timeNS = -1;
3782             }
3783         }
3784         cleanupUMats();
3785     }
3786     else
3787     {
3788         addref();
3789         isInProgress = true;
3790         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3791     }
3792     if (asyncEvent)
3793         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3794     return retval == CL_SUCCESS;
3795 }
3796
3797 bool Kernel::runTask(bool sync, const Queue& q)
3798 {
3799     if(!p || !p->handle || p->isInProgress)
3800         return false;
3801
3802     cl_command_queue qq = getQueue(q);
3803     cl_event asyncEvent = 0;
3804     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3805     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3806     if (sync || retval != CL_SUCCESS)
3807     {
3808         CV_OCL_DBG_CHECK(clFinish(qq));
3809         p->cleanupUMats();
3810     }
3811     else
3812     {
3813         p->addref();
3814         p->isInProgress = true;
3815         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3816     }
3817     if (asyncEvent)
3818         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3819     return retval == CL_SUCCESS;
3820 }
3821
3822 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3823 {
3824     CV_Assert(p && p->handle && !p->isInProgress);
3825     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3826     CV_Assert(q.ptr());
3827     q.finish(); // call clFinish() on base queue
3828     Queue profilingQueue = q.getProfilingQueue();
3829     int64 timeNs = -1;
3830     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3831     return res ? timeNs : -1;
3832 }
3833
3834 size_t Kernel::workGroupSize() const
3835 {
3836     if(!p || !p->handle)
3837         return 0;
3838     size_t val = 0, retsz = 0;
3839     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3840     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3841     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3842     return status == CL_SUCCESS ? val : 0;
3843 }
3844
3845 size_t Kernel::preferedWorkGroupSizeMultiple() const
3846 {
3847     if(!p || !p->handle)
3848         return 0;
3849     size_t val = 0, retsz = 0;
3850     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3851     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3852     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3853     return status == CL_SUCCESS ? val : 0;
3854 }
3855
3856 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3857 {
3858     if(!p || !p->handle || !wsz)
3859         return 0;
3860     size_t retsz = 0;
3861     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3862     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3863     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3864     return status == CL_SUCCESS;
3865 }
3866
3867 size_t Kernel::localMemSize() const
3868 {
3869     if(!p || !p->handle)
3870         return 0;
3871     size_t retsz = 0;
3872     cl_ulong val = 0;
3873     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3874     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3875     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3876     return status == CL_SUCCESS ? (size_t)val : 0;
3877 }
3878
3879
3880
3881 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3882
3883 struct ProgramSource::Impl
3884 {
3885     IMPLEMENT_REFCOUNTABLE();
3886
3887     enum KIND {
3888         PROGRAM_SOURCE_CODE = 0,
3889         PROGRAM_BINARIES,
3890         PROGRAM_SPIR,
3891         PROGRAM_SPIRV
3892     } kind_;
3893
3894     Impl(const String& src)
3895     {
3896         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3897         initFromSource(src, cv::String());
3898     }
3899     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3900     {
3901         init(PROGRAM_SOURCE_CODE, module, name);
3902         initFromSource(codeStr, codeHash);
3903     }
3904
3905     /// reset fields
3906     void init(enum KIND kind, const String& module, const String& name)
3907     {
3908         refcount = 1;
3909         kind_ = kind;
3910         module_ = module;
3911         name_ = name;
3912
3913         sourceAddr_ = NULL;
3914         sourceSize_ = 0;
3915         isHashUpdated = false;
3916     }
3917
3918     void initFromSource(const String& codeStr, const String& codeHash)
3919     {
3920         codeStr_ = codeStr;
3921         sourceHash_ = codeHash;
3922         if (sourceHash_.empty())
3923         {
3924             updateHash();
3925         }
3926         else
3927         {
3928             isHashUpdated = true;
3929         }
3930     }
3931
3932     void updateHash(const char* hashStr = NULL)
3933     {
3934         if (hashStr)
3935         {
3936             sourceHash_ = cv::String(hashStr);
3937             isHashUpdated = true;
3938             return;
3939         }
3940         uint64 hash = 0;
3941         switch (kind_)
3942         {
3943         case PROGRAM_SOURCE_CODE:
3944             if (sourceAddr_)
3945             {
3946                 CV_Assert(codeStr_.empty());
3947                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3948             }
3949             else
3950             {
3951                 CV_Assert(!codeStr_.empty());
3952                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3953             }
3954             break;
3955         case PROGRAM_BINARIES:
3956         case PROGRAM_SPIR:
3957         case PROGRAM_SPIRV:
3958             hash = crc64(sourceAddr_, sourceSize_);
3959             break;
3960         default:
3961             CV_Error(Error::StsInternal, "Internal error");
3962         }
3963         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3964         isHashUpdated = true;
3965     }
3966
3967     Impl(enum KIND kind,
3968             const String& module, const String& name,
3969             const unsigned char* binary, const size_t size,
3970             const cv::String& buildOptions = cv::String())
3971     {
3972         init(kind, module, name);
3973
3974         sourceAddr_ = binary;
3975         sourceSize_ = size;
3976
3977         buildOptions_ = buildOptions;
3978     }
3979
3980     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3981             const char* sourceCodeStaticStr, const char* hashStaticStr,
3982             const cv::String& buildOptions)
3983     {
3984         ProgramSource result;
3985         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3986                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3987         result.p->updateHash(hashStaticStr);
3988         return result;
3989     }
3990
3991     static ProgramSource fromBinary(const String& module, const String& name,
3992             const unsigned char* binary, const size_t size,
3993             const cv::String& buildOptions)
3994     {
3995         ProgramSource result;
3996         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3997         return result;
3998     }
3999
4000     static ProgramSource fromSPIR(const String& module, const String& name,
4001             const unsigned char* binary, const size_t size,
4002             const cv::String& buildOptions)
4003     {
4004         ProgramSource result;
4005         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4006         return result;
4007     }
4008
4009     String module_;
4010     String name_;
4011
4012     // TODO std::vector<ProgramSource> includes_;
4013     String codeStr_; // PROGRAM_SOURCE_CODE only
4014
4015     const unsigned char* sourceAddr_;
4016     size_t sourceSize_;
4017
4018     cv::String buildOptions_;
4019
4020     String sourceHash_;
4021     bool isHashUpdated;
4022
4023     friend struct Program::Impl;
4024     friend struct internal::ProgramEntry;
4025     friend struct Context::Impl;
4026 };
4027
4028
4029 ProgramSource::ProgramSource()
4030 {
4031     p = 0;
4032 }
4033
4034 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4035 {
4036     p = new Impl(module, name, codeStr, codeHash);
4037 }
4038
4039 ProgramSource::ProgramSource(const char* prog)
4040 {
4041     p = new Impl(prog);
4042 }
4043
4044 ProgramSource::ProgramSource(const String& prog)
4045 {
4046     p = new Impl(prog);
4047 }
4048
4049 ProgramSource::~ProgramSource()
4050 {
4051     if(p)
4052         p->release();
4053 }
4054
4055 ProgramSource::ProgramSource(const ProgramSource& prog)
4056 {
4057     p = prog.p;
4058     if(p)
4059         p->addref();
4060 }
4061
4062 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4063 {
4064     Impl* newp = (Impl*)prog.p;
4065     if(newp)
4066         newp->addref();
4067     if(p)
4068         p->release();
4069     p = newp;
4070     return *this;
4071 }
4072
4073 const String& ProgramSource::source() const
4074 {
4075     CV_Assert(p);
4076     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4077     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4078     return p->codeStr_;
4079 }
4080
4081 ProgramSource::hash_t ProgramSource::hash() const
4082 {
4083     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4084 }
4085
4086 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4087         const unsigned char* binary, const size_t size,
4088         const cv::String& buildOptions)
4089 {
4090     CV_Assert(binary);
4091     CV_Assert(size > 0);
4092     return Impl::fromBinary(module, name, binary, size, buildOptions);
4093 }
4094
4095 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4096         const unsigned char* binary, const size_t size,
4097         const cv::String& buildOptions)
4098 {
4099     CV_Assert(binary);
4100     CV_Assert(size > 0);
4101     return Impl::fromBinary(module, name, binary, size, buildOptions);
4102 }
4103
4104
4105 internal::ProgramEntry::operator ProgramSource&() const
4106 {
4107     if (this->pProgramSource == NULL)
4108     {
4109         cv::AutoLock lock(cv::getInitializationMutex());
4110         if (this->pProgramSource == NULL)
4111         {
4112             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4113             ProgramSource* ptr = new ProgramSource(ps);
4114             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4115         }
4116     }
4117     return *this->pProgramSource;
4118 }
4119
4120
4121
4122 /////////////////////////////////////////// Program /////////////////////////////////////////////
4123
4124 static
4125 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4126 {
4127     if (b.empty())
4128         return a;
4129     if (a.empty())
4130         return b;
4131     if (b[0] == ' ')
4132         return a + b;
4133     return a + (cv::String(" ") + b);
4134 }
4135
4136 struct Program::Impl
4137 {
4138     IMPLEMENT_REFCOUNTABLE();
4139
4140     Impl(const ProgramSource& src,
4141          const String& _buildflags, String& errmsg) :
4142          refcount(1),
4143          handle(NULL),
4144          buildflags(_buildflags)
4145     {
4146         const ProgramSource::Impl* src_ = src.getImpl();
4147         CV_Assert(src_);
4148         sourceModule_ = src_->module_;
4149         sourceName_ = src_->name_;
4150         const Context ctx = Context::getDefault();
4151         Device device = ctx.device(0);
4152         if (ctx.ptr() == NULL || device.ptr() == NULL)
4153             return;
4154         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4155         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4156         {
4157             if (device.isAMD())
4158                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4159             else if (device.isIntel())
4160                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4161             const String param_buildExtraOptions = getBuildExtraOptions();
4162             if (!param_buildExtraOptions.empty())
4163                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4164         }
4165         compile(ctx, src_, errmsg);
4166     }
4167
4168     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4169     {
4170         CV_Assert(ctx.getImpl());
4171         CV_Assert(src_);
4172
4173         // We don't cache OpenCL binaries
4174         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4175         {
4176             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4177             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4178             return isLoaded;
4179         }
4180         return compileWithCache(ctx, src_, errmsg);
4181     }
4182
4183     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4184     {
4185         CV_Assert(ctx.getImpl());
4186         CV_Assert(src_);
4187         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4188
4189 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4190         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4191         const std::string base_dir = config.prepareCacheDirectoryForContext(
4192                 ctx.getImpl()->getPrefixString(),
4193                 ctx.getImpl()->getPrefixBase()
4194         );
4195         const String& hash_str = src_->sourceHash_;
4196         cv::String fname;
4197         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4198         {
4199             CV_Assert(!hash_str.empty());
4200             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4201             fname = utils::fs::join(base_dir, fname);
4202         }
4203         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4204         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4205         {
4206             try
4207             {
4208                 std::vector<char> binaryBuf;
4209                 bool res = false;
4210                 {
4211                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4212                     BinaryProgramFile file(fname, hash_str.c_str());
4213                     res = file.read(buildflags, binaryBuf);
4214                 }
4215                 if (res)
4216                 {
4217                     CV_Assert(!binaryBuf.empty());
4218                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4219                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4220                     if (isLoaded)
4221                         return true;
4222                 }
4223             }
4224             catch (const cv::Exception& e)
4225             {
4226                 CV_UNUSED(e);
4227                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4228             }
4229             catch (...)
4230             {
4231                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4232             }
4233         }
4234 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4235         CV_Assert(handle == NULL);
4236         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4237         {
4238             if (!buildFromSources(ctx, src_, errmsg))
4239             {
4240                 return false;
4241             }
4242         }
4243         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4244         {
4245             buildflags = joinBuildOptions(buildflags, " -x spir");
4246             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4247             {
4248                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4249             }
4250             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4251             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4252             if (!isLoaded)
4253                 return false;
4254         }
4255         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4256         {
4257             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4258         }
4259         else
4260         {
4261             CV_Error(Error::StsInternal, "Internal error");
4262         }
4263         CV_Assert(handle != NULL);
4264 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4265         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4266         {
4267             try
4268             {
4269                 std::vector<char> binaryBuf;
4270                 getProgramBinary(binaryBuf);
4271                 {
4272                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4273                     BinaryProgramFile file(fname, hash_str.c_str());
4274                     file.write(buildflags, binaryBuf);
4275                 }
4276             }
4277             catch (const cv::Exception& e)
4278             {
4279                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4280             }
4281             catch (...)
4282             {
4283                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4284             }
4285         }
4286 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4287 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4288         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4289         {
4290             std::vector<char> binaryBuf;
4291             getProgramBinary(binaryBuf);
4292             if (!binaryBuf.empty())
4293             {
4294                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4295                 handle = NULL;
4296                 createFromBinary(ctx, binaryBuf, errmsg);
4297             }
4298         }
4299 #endif
4300         return handle != NULL;
4301     }
4302
4303     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4304     {
4305         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4306
4307         size_t retsz = 0;
4308         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4309                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4310         if (log_retval == CL_SUCCESS && retsz > 1)
4311         {
4312             buffer.resize(retsz + 16);
4313             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4314                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4315             if (log_retval == CL_SUCCESS)
4316             {
4317                 if (retsz < buffer.size())
4318                     buffer[retsz] = 0;
4319                 else
4320                     buffer[buffer.size() - 1] = 0;
4321             }
4322             else
4323             {
4324                 buffer[0] = 0;
4325             }
4326         }
4327
4328         errmsg = String(buffer.data());
4329         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4330                 sourceModule_.c_str(), sourceName_.c_str(),
4331                 result, getOpenCLErrorString(result),
4332                 buildflags.c_str(), errmsg.c_str());
4333         fflush(stdout);
4334     }
4335
4336     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4337     {
4338         CV_Assert(src_);
4339         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4340         CV_Assert(handle == NULL);
4341         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4342                 sourceModule_.c_str(), sourceName_.c_str(),
4343                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4344
4345         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4346
4347         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4348         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4349         CV_Assert(srcptr != NULL);
4350         CV_Assert(srclen > 0);
4351
4352         cl_int retval = 0;
4353
4354         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4355         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4356         CV_Assert(handle || retval != CL_SUCCESS);
4357         if (handle && retval == CL_SUCCESS)
4358         {
4359             size_t n = ctx.ndevices();
4360             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4361             cl_device_id* deviceList = deviceListBuf.data();
4362             for (size_t i = 0; i < n; i++)
4363             {
4364                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4365             }
4366
4367             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4368             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4369 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4370             if (retval != CL_SUCCESS)
4371 #endif
4372             {
4373                 dumpBuildLog_(retval, deviceList, errmsg);
4374
4375                 // don't remove "retval != CL_SUCCESS" condition here:
4376                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4377                 if (retval != CL_SUCCESS && handle)
4378                 {
4379                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4380                     handle = NULL;
4381                 }
4382             }
4383 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4384             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4385             {
4386                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4387                 size_t retsz = 0;
4388                 char kernels_buffer[4096] = {0};
4389                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4390                 if (retsz < sizeof(kernels_buffer))
4391                     kernels_buffer[retsz] = 0;
4392                 else
4393                     kernels_buffer[0] = 0;
4394                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4395             }
4396 #endif
4397
4398         }
4399         return handle != NULL;
4400     }
4401
4402     void getProgramBinary(std::vector<char>& buf)
4403     {
4404         CV_Assert(handle);
4405         size_t sz = 0;
4406         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4407         buf.resize(sz);
4408         uchar* ptr = (uchar*)&buf[0];
4409         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4410     }
4411
4412     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4413     {
4414         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4415     }
4416
4417     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4418     {
4419         CV_Assert(handle == NULL);
4420         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4421         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4422
4423         CV_Assert(binarySize > 0);
4424
4425         size_t ndevices = (int)ctx.ndevices();
4426         AutoBuffer<cl_device_id> devices_(ndevices);
4427         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4428         AutoBuffer<size_t> binarySizes_(ndevices);
4429
4430         cl_device_id* devices = devices_.data();
4431         const uchar** binaryPtrs = binaryPtrs_.data();
4432         size_t* binarySizes = binarySizes_.data();
4433         for (size_t i = 0; i < ndevices; i++)
4434         {
4435             devices[i] = (cl_device_id)ctx.device(i).ptr();
4436             binaryPtrs[i] = binaryAddr;
4437             binarySizes[i] = binarySize;
4438         }
4439
4440         cl_int result = 0;
4441         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4442                                            binarySizes, binaryPtrs, NULL, &result);
4443         if (result != CL_SUCCESS)
4444         {
4445             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4446             if (handle)
4447             {
4448                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4449                 handle = NULL;
4450             }
4451         }
4452         if (!handle)
4453         {
4454             return false;
4455         }
4456         // call clBuildProgram()
4457         {
4458             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4459             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4460             if (result != CL_SUCCESS)
4461             {
4462                 dumpBuildLog_(result, devices, errmsg);
4463                 if (handle)
4464                 {
4465                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4466                     handle = NULL;
4467                 }
4468                 return false;
4469             }
4470         }
4471         // check build status
4472         {
4473             cl_build_status build_status = CL_BUILD_NONE;
4474             size_t retsz = 0;
4475             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4476                     sizeof(build_status), &build_status, &retsz));
4477             if (result == CL_SUCCESS)
4478             {
4479                 if (build_status == CL_BUILD_SUCCESS)
4480                 {
4481                     return true;
4482                 }
4483                 else
4484                 {
4485                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4486                     return false;
4487                 }
4488             }
4489             else
4490             {
4491                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4492                 if (handle)
4493                 {
4494                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4495                     handle = NULL;
4496                 }
4497             }
4498         }
4499 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4500         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4501         {
4502             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4503             size_t retsz = 0;
4504             char kernels_buffer[4096] = {0};
4505             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4506             if (retsz < sizeof(kernels_buffer))
4507                 kernels_buffer[retsz] = 0;
4508             else
4509                 kernels_buffer[0] = 0;
4510             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4511         }
4512 #endif
4513         return handle != NULL;
4514     }
4515
4516     ~Impl()
4517     {
4518         if( handle )
4519         {
4520 #ifdef _WIN32
4521             if (!cv::__termination)
4522 #endif
4523             {
4524                 clReleaseProgram(handle);
4525             }
4526             handle = NULL;
4527         }
4528     }
4529
4530     cl_program handle;
4531
4532     String buildflags;
4533     String sourceModule_;
4534     String sourceName_;
4535 };
4536
4537
4538 Program::Program() { p = 0; }
4539
4540 Program::Program(const ProgramSource& src,
4541         const String& buildflags, String& errmsg)
4542 {
4543     p = 0;
4544     create(src, buildflags, errmsg);
4545 }
4546
4547 Program::Program(const Program& prog)
4548 {
4549     p = prog.p;
4550     if(p)
4551         p->addref();
4552 }
4553
4554 Program& Program::operator = (const Program& prog)
4555 {
4556     Impl* newp = (Impl*)prog.p;
4557     if(newp)
4558         newp->addref();
4559     if(p)
4560         p->release();
4561     p = newp;
4562     return *this;
4563 }
4564
4565 Program::~Program()
4566 {
4567     if(p)
4568         p->release();
4569 }
4570
4571 bool Program::create(const ProgramSource& src,
4572             const String& buildflags, String& errmsg)
4573 {
4574     if(p)
4575     {
4576         p->release();
4577         p = NULL;
4578     }
4579     p = new Impl(src, buildflags, errmsg);
4580     if(!p->handle)
4581     {
4582         p->release();
4583         p = 0;
4584     }
4585     return p != 0;
4586 }
4587
4588 void* Program::ptr() const
4589 {
4590     return p ? p->handle : 0;
4591 }
4592
4593 #ifndef OPENCV_REMOVE_DEPRECATED_API
4594 const ProgramSource& Program::source() const
4595 {
4596     CV_Error(Error::StsNotImplemented, "Removed API");
4597 }
4598
4599 bool Program::read(const String& bin, const String& buildflags)
4600 {
4601     CV_UNUSED(bin); CV_UNUSED(buildflags);
4602     CV_Error(Error::StsNotImplemented, "Removed API");
4603 }
4604
4605 bool Program::write(String& bin) const
4606 {
4607     CV_UNUSED(bin);
4608     CV_Error(Error::StsNotImplemented, "Removed API");
4609 }
4610
4611 String Program::getPrefix() const
4612 {
4613     if(!p)
4614         return String();
4615     Context::Impl* ctx_ = Context::getDefault().getImpl();
4616     CV_Assert(ctx_);
4617     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4618 }
4619
4620 String Program::getPrefix(const String& buildflags)
4621 {
4622         Context::Impl* ctx_ = Context::getDefault().getImpl();
4623         CV_Assert(ctx_);
4624         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4625 }
4626 #endif // OPENCV_REMOVE_DEPRECATED_API
4627
4628 void Program::getBinary(std::vector<char>& binary) const
4629 {
4630     CV_Assert(p && "Empty program");
4631     p->getProgramBinary(binary);
4632 }
4633
4634 Program Context::Impl::getProg(const ProgramSource& src,
4635                                const String& buildflags, String& errmsg)
4636 {
4637     size_t limit = getProgramCountLimit();
4638     const ProgramSource::Impl* src_ = src.getImpl();
4639     CV_Assert(src_);
4640     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4641             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4642             getPrefixString().c_str(),
4643             buildflags.c_str());
4644     {
4645         cv::AutoLock lock(program_cache_mutex);
4646         phash_t::iterator it = phash.find(key);
4647         if (it != phash.end())
4648         {
4649             // TODO LRU cache
4650             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4651             if (i != cacheList.end() && i != cacheList.begin())
4652             {
4653                 cacheList.erase(i);
4654                 cacheList.push_front(key);
4655             }
4656             return it->second;
4657         }
4658         { // cleanup program cache
4659             size_t sz = phash.size();
4660             if (limit > 0 && sz >= limit)
4661             {
4662                 static bool warningFlag = false;
4663                 if (!warningFlag)
4664                 {
4665                     printf("\nWARNING: OpenCV-OpenCL:\n"
4666                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4667                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4668                     warningFlag = true;
4669                 }
4670                 while (!cacheList.empty())
4671                 {
4672                     size_t c = phash.erase(cacheList.back());
4673                     cacheList.pop_back();
4674                     if (c != 0)
4675                         break;
4676                 }
4677             }
4678         }
4679     }
4680     Program prog(src, buildflags, errmsg);
4681     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4682     {
4683         cv::AutoLock lock(program_cache_mutex);
4684         phash.insert(std::pair<std::string, Program>(key, prog));
4685         cacheList.push_front(key);
4686     }
4687     return prog;
4688 }
4689
4690
4691 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4692
4693 template<typename T>
4694 class OpenCLBufferPool
4695 {
4696 protected:
4697     ~OpenCLBufferPool() { }
4698 public:
4699     virtual T allocate(size_t size) = 0;
4700     virtual void release(T buffer) = 0;
4701 };
4702
4703 template <typename Derived, typename BufferEntry, typename T>
4704 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4705 {
4706 private:
4707     inline Derived& derived() { return *static_cast<Derived*>(this); }
4708 protected:
4709     Mutex mutex_;
4710
4711     size_t currentReservedSize;
4712     size_t maxReservedSize;
4713
4714     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4715     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4716
4717     // synchronized
4718     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4719     {
4720         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4721         for (; i != allocatedEntries_.end(); ++i)
4722         {
4723             BufferEntry& e = *i;
4724             if (e.clBuffer_ == buffer)
4725             {
4726                 entry = e;
4727                 allocatedEntries_.erase(i);
4728                 return true;
4729             }
4730         }
4731         return false;
4732     }
4733
4734     // synchronized
4735     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4736     {
4737         if (reservedEntries_.empty())
4738             return false;
4739         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4740         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4741         BufferEntry result;
4742         size_t minDiff = (size_t)(-1);
4743         for (; i != reservedEntries_.end(); ++i)
4744         {
4745             BufferEntry& e = *i;
4746             if (e.capacity_ >= size)
4747             {
4748                 size_t diff = e.capacity_ - size;
4749                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4750                 {
4751                     minDiff = diff;
4752                     result_pos = i;
4753                     result = e;
4754                     if (diff == 0)
4755                         break;
4756                 }
4757             }
4758         }
4759         if (result_pos != reservedEntries_.end())
4760         {
4761             //CV_DbgAssert(result == *result_pos);
4762             reservedEntries_.erase(result_pos);
4763             entry = result;
4764             currentReservedSize -= entry.capacity_;
4765             allocatedEntries_.push_back(entry);
4766             return true;
4767         }
4768         return false;
4769     }
4770
4771     // synchronized
4772     void _checkSizeOfReservedEntries()
4773     {
4774         while (currentReservedSize > maxReservedSize)
4775         {
4776             CV_DbgAssert(!reservedEntries_.empty());
4777             const BufferEntry& entry = reservedEntries_.back();
4778             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4779             currentReservedSize -= entry.capacity_;
4780             derived()._releaseBufferEntry(entry);
4781             reservedEntries_.pop_back();
4782         }
4783     }
4784
4785     inline size_t _allocationGranularity(size_t size)
4786     {
4787         // heuristic values
4788         if (size < 1024*1024)
4789             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4790         else if (size < 16*1024*1024)
4791             return 64*1024;
4792         else
4793             return 1024*1024;
4794     }
4795
4796 public:
4797     OpenCLBufferPoolBaseImpl()
4798         : currentReservedSize(0),
4799           maxReservedSize(0)
4800     {
4801         // nothing
4802     }
4803     virtual ~OpenCLBufferPoolBaseImpl()
4804     {
4805         freeAllReservedBuffers();
4806         CV_Assert(reservedEntries_.empty());
4807     }
4808 public:
4809     virtual T allocate(size_t size) CV_OVERRIDE
4810     {
4811         AutoLock locker(mutex_);
4812         BufferEntry entry;
4813         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4814         {
4815             CV_DbgAssert(size <= entry.capacity_);
4816             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4817         }
4818         else
4819         {
4820             derived()._allocateBufferEntry(entry, size);
4821         }
4822         return entry.clBuffer_;
4823     }
4824     virtual void release(T buffer) CV_OVERRIDE
4825     {
4826         AutoLock locker(mutex_);
4827         BufferEntry entry;
4828         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4829         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4830         {
4831             derived()._releaseBufferEntry(entry);
4832         }
4833         else
4834         {
4835             reservedEntries_.push_front(entry);
4836             currentReservedSize += entry.capacity_;
4837             _checkSizeOfReservedEntries();
4838         }
4839     }
4840
4841     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4842     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4843     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4844     {
4845         AutoLock locker(mutex_);
4846         size_t oldMaxReservedSize = maxReservedSize;
4847         maxReservedSize = size;
4848         if (maxReservedSize < oldMaxReservedSize)
4849         {
4850             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4851             for (; i != reservedEntries_.end();)
4852             {
4853                 const BufferEntry& entry = *i;
4854                 if (entry.capacity_ > maxReservedSize / 8)
4855                 {
4856                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4857                     currentReservedSize -= entry.capacity_;
4858                     derived()._releaseBufferEntry(entry);
4859                     i = reservedEntries_.erase(i);
4860                     continue;
4861                 }
4862                 ++i;
4863             }
4864             _checkSizeOfReservedEntries();
4865         }
4866     }
4867     virtual void freeAllReservedBuffers() CV_OVERRIDE
4868     {
4869         AutoLock locker(mutex_);
4870         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4871         for (; i != reservedEntries_.end(); ++i)
4872         {
4873             const BufferEntry& entry = *i;
4874             derived()._releaseBufferEntry(entry);
4875         }
4876         reservedEntries_.clear();
4877         currentReservedSize = 0;
4878     }
4879 };
4880
4881 struct CLBufferEntry
4882 {
4883     cl_mem clBuffer_;
4884     size_t capacity_;
4885     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4886 };
4887
4888 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4889 {
4890 public:
4891     typedef struct CLBufferEntry BufferEntry;
4892 protected:
4893     int createFlags_;
4894 public:
4895     OpenCLBufferPoolImpl(int createFlags = 0)
4896         : createFlags_(createFlags)
4897     {
4898     }
4899
4900     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4901     {
4902         CV_DbgAssert(entry.clBuffer_ == NULL);
4903         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4904         Context& ctx = Context::getDefault();
4905         cl_int retval = CL_SUCCESS;
4906         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4907         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4908         CV_Assert(entry.clBuffer_ != NULL);
4909         if(retval == CL_SUCCESS)
4910         {
4911             CV_IMPL_ADD(CV_IMPL_OCL);
4912         }
4913         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4914                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4915         allocatedEntries_.push_back(entry);
4916     }
4917
4918     void _releaseBufferEntry(const BufferEntry& entry)
4919     {
4920         CV_Assert(entry.capacity_ != 0);
4921         CV_Assert(entry.clBuffer_ != NULL);
4922         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4923                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4924         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4925     }
4926 };
4927
4928 #ifdef HAVE_OPENCL_SVM
4929 struct CLSVMBufferEntry
4930 {
4931     void* clBuffer_;
4932     size_t capacity_;
4933     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4934 };
4935 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4936 {
4937 public:
4938     typedef struct CLSVMBufferEntry BufferEntry;
4939 public:
4940     OpenCLSVMBufferPoolImpl()
4941     {
4942     }
4943
4944     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4945     {
4946         CV_DbgAssert(entry.clBuffer_ == NULL);
4947         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4948
4949         Context& ctx = Context::getDefault();
4950         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4951         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4952         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4953                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4954
4955         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4956         CV_DbgAssert(svmFns->isValid());
4957
4958         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4959         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4960         CV_Assert(buf);
4961
4962         entry.clBuffer_ = buf;
4963         {
4964             CV_IMPL_ADD(CV_IMPL_OCL);
4965         }
4966         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4967                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4968         allocatedEntries_.push_back(entry);
4969     }
4970
4971     void _releaseBufferEntry(const BufferEntry& entry)
4972     {
4973         CV_Assert(entry.capacity_ != 0);
4974         CV_Assert(entry.clBuffer_ != NULL);
4975         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4976                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4977         Context& ctx = Context::getDefault();
4978         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4979         CV_DbgAssert(svmFns->isValid());
4980         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4981         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4982     }
4983 };
4984 #endif
4985
4986
4987
4988 template <bool readAccess, bool writeAccess>
4989 class AlignedDataPtr
4990 {
4991 protected:
4992     const size_t size_;
4993     uchar* const originPtr_;
4994     const size_t alignment_;
4995     uchar* ptr_;
4996     uchar* allocatedPtr_;
4997
4998 public:
4999     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
5000         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5001     {
5002         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5003         CV_DbgAssert(!readAccess || ptr);
5004         if (((size_t)ptr_ & (alignment - 1)) != 0)
5005         {
5006             allocatedPtr_ = new uchar[size_ + alignment - 1];
5007             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5008             if (readAccess)
5009             {
5010                 memcpy(ptr_, originPtr_, size_);
5011             }
5012         }
5013     }
5014
5015     uchar* getAlignedPtr() const
5016     {
5017         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5018         return ptr_;
5019     }
5020
5021     ~AlignedDataPtr()
5022     {
5023         if (allocatedPtr_)
5024         {
5025             if (writeAccess)
5026             {
5027                 memcpy(originPtr_, ptr_, size_);
5028             }
5029             delete[] allocatedPtr_;
5030             allocatedPtr_ = NULL;
5031         }
5032         ptr_ = NULL;
5033     }
5034 private:
5035     AlignedDataPtr(const AlignedDataPtr&); // disabled
5036     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5037 };
5038
5039 template <bool readAccess, bool writeAccess>
5040 class AlignedDataPtr2D
5041 {
5042 protected:
5043     const size_t size_;
5044     uchar* const originPtr_;
5045     const size_t alignment_;
5046     uchar* ptr_;
5047     uchar* allocatedPtr_;
5048     size_t rows_;
5049     size_t cols_;
5050     size_t step_;
5051
5052 public:
5053     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5054         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5055     {
5056         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5057         CV_DbgAssert(!readAccess || ptr != NULL);
5058         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5059         {
5060             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5061             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5062             if (readAccess)
5063             {
5064                 for (size_t i = 0; i < rows_; i++)
5065                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5066             }
5067         }
5068     }
5069
5070     uchar* getAlignedPtr() const
5071     {
5072         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5073         return ptr_;
5074     }
5075
5076     ~AlignedDataPtr2D()
5077     {
5078         if (allocatedPtr_)
5079         {
5080             if (writeAccess)
5081             {
5082                 for (size_t i = 0; i < rows_; i++)
5083                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5084             }
5085             delete[] allocatedPtr_;
5086             allocatedPtr_ = NULL;
5087         }
5088         ptr_ = NULL;
5089     }
5090 private:
5091     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5092     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5093 };
5094
5095 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5096 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5097 #endif
5098
5099
5100 void Context::Impl::__init_buffer_pools()
5101 {
5102     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5103     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5104     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5105     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5106
5107     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5108     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5109     bufferPool.setMaxReservedSize(poolSize);
5110     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5111     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5112
5113 #ifdef HAVE_OPENCL_SVM
5114     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5115     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5116     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5117     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5118 #endif
5119
5120     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5121 }
5122
5123 class OpenCLAllocator CV_FINAL : public MatAllocator
5124 {
5125 public:
5126     enum AllocatorFlags
5127     {
5128         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5129         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5130 #ifdef HAVE_OPENCL_SVM
5131         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5132 #endif
5133         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5134     };
5135
5136     OpenCLAllocator()
5137     {
5138         matStdAllocator = Mat::getDefaultAllocator();
5139     }
5140     ~OpenCLAllocator()
5141     {
5142         flushCleanupQueue();
5143     }
5144
5145     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5146             AccessFlag flags, UMatUsageFlags usageFlags) const
5147     {
5148         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5149         return u;
5150     }
5151
5152     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5153     {
5154         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5155         return value;
5156     }
5157     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5158     {
5159         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5160         return value;
5161     }
5162
5163     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5164     {
5165         const Device& dev = ctx.device(0);
5166         createFlags = 0;
5167         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5168             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5169
5170         if (!isOpenCLCopyingForced() &&
5171             (isOpenCLMapForced() ||
5172                 (dev.hostUnifiedMemory()
5173 #ifndef __APPLE__
5174                 || dev.isIntel()
5175 #endif
5176                 )
5177             )
5178         )
5179             flags0 = static_cast<UMatData::MemoryFlag>(0);
5180         else
5181             flags0 = UMatData::COPY_ON_MAP;
5182     }
5183
5184     UMatData* allocate(int dims, const int* sizes, int type,
5185                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5186     {
5187         if(!useOpenCL())
5188             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5189
5190         flushCleanupQueue();
5191
5192         CV_Assert(data == 0);
5193         size_t total = CV_ELEM_SIZE(type);
5194         for( int i = dims-1; i >= 0; i-- )
5195         {
5196             if( step )
5197                 step[i] = total;
5198             total *= sizes[i];
5199         }
5200
5201         Context& ctx = Context::getDefault();
5202         if (!ctx.getImpl())
5203             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5204         Context::Impl& ctxImpl = *ctx.getImpl();
5205
5206         int createFlags = 0;
5207         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5208         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5209
5210         void* handle = NULL;
5211         int allocatorFlags = 0;
5212
5213 #ifdef HAVE_OPENCL_SVM
5214         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5215         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5216         {
5217             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5218             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5219
5220             // this property is constant, so single buffer pool can be used here
5221             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5222             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5223         }
5224         else
5225 #endif
5226         if (createFlags == 0)
5227         {
5228             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5229             handle = ctxImpl.getBufferPool().allocate(total);
5230         }
5231         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5232         {
5233             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5234             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5235         }
5236         else
5237         {
5238             CV_Assert(handle != NULL); // Unsupported, throw
5239         }
5240
5241         if (!handle)
5242             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5243
5244         UMatData* u = new UMatData(this);
5245         u->data = 0;
5246         u->size = total;
5247         u->handle = handle;
5248         u->flags = flags0;
5249         u->allocatorFlags_ = allocatorFlags;
5250         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5251         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5252         u->markHostCopyObsolete(true);
5253         opencl_allocator_stats.onAllocate(u->size);
5254         return u;
5255     }
5256
5257     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5258     {
5259         if(!u)
5260             return false;
5261
5262         flushCleanupQueue();
5263
5264         UMatDataAutoLock lock(u);
5265
5266         if(u->handle == 0)
5267         {
5268             CV_Assert(u->origdata != 0);
5269             Context& ctx = Context::getDefault();
5270             int createFlags = 0;
5271             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5272             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5273
5274             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5275
5276             cl_context ctx_handle = (cl_context)ctx.ptr();
5277             int allocatorFlags = 0;
5278             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5279             void* handle = NULL;
5280             cl_int retval = CL_SUCCESS;
5281
5282 #ifdef HAVE_OPENCL_SVM
5283             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5284             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5285             if (useSVM && svmCaps.isSupportFineGrainSystem())
5286             {
5287                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5288                 tempUMatFlags = UMatData::TEMP_UMAT;
5289                 handle = u->origdata;
5290                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5291             }
5292             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5293             {
5294                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5295                 {
5296                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5297
5298                     cl_svm_mem_flags memFlags = createFlags |
5299                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5300
5301                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5302                     CV_DbgAssert(svmFns->isValid());
5303
5304                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5305                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5306                     CV_Assert(handle);
5307
5308                     cl_command_queue q = NULL;
5309                     if (!isFineGrainBuffer)
5310                     {
5311                         q = (cl_command_queue)Queue::getDefault().ptr();
5312                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5313                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5314                                 handle, u->size,
5315                                 0, NULL, NULL);
5316                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5317
5318                     }
5319                     memcpy(handle, u->origdata, u->size);
5320                     if (!isFineGrainBuffer)
5321                     {
5322                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5323                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5324                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5325                     }
5326
5327                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5328                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5329                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5330                 }
5331             }
5332             else
5333 #endif
5334             {
5335                 if( copyOnMap )
5336                     accessFlags &= ~ACCESS_FAST;
5337
5338                 tempUMatFlags = UMatData::TEMP_UMAT;
5339                 if (
5340                 #ifdef __APPLE__
5341                     !copyOnMap &&
5342                 #endif
5343                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5344                     // There are OpenCL runtime issues for less aligned data
5345                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5346                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5347                     // Avoid sharing of host memory between OpenCL buffers
5348                     && !(u->originalUMatData && u->originalUMatData->handle)
5349                 )
5350                 {
5351                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5352                                             u->size, u->origdata, &retval);
5353                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5354                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5355                 }
5356                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5357                 {
5358                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5359                                                u->size, u->origdata, &retval);
5360                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5361                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5362                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5363                 }
5364             }
5365             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5366             if(!handle || retval != CL_SUCCESS)
5367                 return false;
5368             u->handle = handle;
5369             u->prevAllocator = u->currAllocator;
5370             u->currAllocator = this;
5371             u->flags |= tempUMatFlags | flags0;
5372             u->allocatorFlags_ = allocatorFlags;
5373         }
5374         if (!!(accessFlags & ACCESS_WRITE))
5375             u->markHostCopyObsolete(true);
5376         opencl_allocator_stats.onAllocate(u->size);
5377         return true;
5378     }
5379
5380     /*void sync(UMatData* u) const
5381     {
5382         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5383         UMatDataAutoLock lock(u);
5384
5385         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5386         {
5387             if( u->tempCopiedUMat() )
5388             {
5389                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5390                                     u->size, u->origdata, 0, 0, 0);
5391             }
5392             else
5393             {
5394                 cl_int retval = 0;
5395                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5396                                                 (CL_MAP_READ | CL_MAP_WRITE),
5397                                                 0, u->size, 0, 0, 0, &retval);
5398                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5399                 clFinish(q);
5400             }
5401             u->markHostCopyObsolete(false);
5402         }
5403         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5404         {
5405             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5406                                  u->size, u->data, 0, 0, 0);
5407         }
5408     }*/
5409
5410     void deallocate(UMatData* u) const CV_OVERRIDE
5411     {
5412         if(!u)
5413             return;
5414
5415         CV_Assert(u->urefcount == 0);
5416         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5417
5418         CV_Assert(u->handle != 0);
5419         CV_Assert(u->mapcount == 0);
5420
5421         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5422             addToCleanupQueue(u);
5423         else
5424             deallocate_(u);
5425     }
5426
5427     void deallocate_(UMatData* u) const
5428     {
5429         CV_Assert(u);
5430         CV_Assert(u->handle);
5431         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5432         {
5433             opencl_allocator_stats.onFree(u->size);
5434         }
5435
5436 #ifdef _WIN32
5437         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5438             return;             // avoid any OpenCL calls
5439 #endif
5440         if(u->tempUMat())
5441         {
5442             CV_Assert(u->origdata);
5443 //            UMatDataAutoLock lock(u);
5444
5445             if (u->hostCopyObsolete())
5446             {
5447 #ifdef HAVE_OPENCL_SVM
5448                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5449                 {
5450                     Context& ctx = Context::getDefault();
5451                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5452                     CV_DbgAssert(svmFns->isValid());
5453
5454                     if( u->tempCopiedUMat() )
5455                     {
5456                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5457                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5458                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5459                         cl_command_queue q = NULL;
5460                         if (!isFineGrainBuffer)
5461                         {
5462                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5463                             q = (cl_command_queue)Queue::getDefault().ptr();
5464                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5465                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5466                                     u->handle, u->size,
5467                                     0, NULL, NULL);
5468                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5469                         }
5470                         clFinish(q);
5471                         memcpy(u->origdata, u->handle, u->size);
5472                         if (!isFineGrainBuffer)
5473                         {
5474                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5475                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5476                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5477                         }
5478                     }
5479                     else
5480                     {
5481                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5482                         // nothing
5483                     }
5484                 }
5485                 else
5486 #endif
5487                 {
5488                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5489                     if( u->tempCopiedUMat() )
5490                     {
5491                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5492                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5493                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5494                     }
5495                     else
5496                     {
5497                         cl_int retval = 0;
5498                         if (u->tempUMat())
5499                         {
5500                             CV_Assert(u->mapcount == 0);
5501                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5502                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5503                                 (CL_MAP_READ | CL_MAP_WRITE),
5504                                 0, u->size, 0, 0, 0, &retval);
5505                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5506                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5507                             if (u->originalUMatData)
5508                             {
5509                                 CV_Assert(u->originalUMatData->data == data);
5510                             }
5511                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5512                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5513                             CV_OCL_DBG_CHECK(clFinish(q));
5514                         }
5515                     }
5516                 }
5517                 u->markHostCopyObsolete(false);
5518             }
5519             else
5520             {
5521                 // nothing
5522             }
5523 #ifdef HAVE_OPENCL_SVM
5524             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5525             {
5526                 if( u->tempCopiedUMat() )
5527                 {
5528                     Context& ctx = Context::getDefault();
5529                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5530                     CV_DbgAssert(svmFns->isValid());
5531
5532                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5533                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5534                 }
5535             }
5536             else
5537 #endif
5538             {
5539                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5540                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5541             }
5542             u->handle = 0;
5543             u->markDeviceCopyObsolete(true);
5544             u->currAllocator = u->prevAllocator;
5545             u->prevAllocator = NULL;
5546             if(u->data && u->copyOnMap() && u->data != u->origdata)
5547                 fastFree(u->data);
5548             u->data = u->origdata;
5549             u->currAllocator->deallocate(u);
5550             u = NULL;
5551         }
5552         else
5553         {
5554             CV_Assert(u->origdata == NULL);
5555             if(u->data && u->copyOnMap() && u->data != u->origdata)
5556             {
5557                 fastFree(u->data);
5558                 u->data = 0;
5559                 u->markHostCopyObsolete(true);
5560             }
5561             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5562             {
5563                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5564                 CV_Assert(pCtx);
5565                 ocl::Context& ctx = *pCtx.get();
5566                 CV_Assert(ctx.getImpl());
5567                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5568             }
5569             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5570             {
5571                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5572                 CV_Assert(pCtx);
5573                 ocl::Context& ctx = *pCtx.get();
5574                 CV_Assert(ctx.getImpl());
5575                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5576             }
5577 #ifdef HAVE_OPENCL_SVM
5578             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5579             {
5580                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5581                 CV_Assert(pCtx);
5582                 ocl::Context& ctx = *pCtx.get();
5583                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5584                 {
5585                     //nothing
5586                 }
5587                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5588                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5589                 {
5590                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5591                     CV_DbgAssert(svmFns->isValid());
5592                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5593
5594                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5595                     {
5596                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5597                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5598                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5599                     }
5600                 }
5601                 CV_Assert(ctx.getImpl());
5602                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5603             }
5604 #endif
5605             else
5606             {
5607                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5608             }
5609             u->handle = 0;
5610             u->markDeviceCopyObsolete(true);
5611             delete u;
5612             u = NULL;
5613         }
5614         CV_Assert(u == NULL);
5615     }
5616
5617     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5618     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5619     {
5620         CV_Assert(u && u->handle);
5621
5622         if (!!(accessFlags & ACCESS_WRITE))
5623             u->markDeviceCopyObsolete(true);
5624
5625         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5626
5627         {
5628             if( !u->copyOnMap() )
5629             {
5630                 // TODO
5631                 // because there can be other map requests for the same UMat with different access flags,
5632                 // we use the universal (read-write) access mode.
5633 #ifdef HAVE_OPENCL_SVM
5634                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5635                 {
5636                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5637                     {
5638                         Context& ctx = Context::getDefault();
5639                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5640                         CV_DbgAssert(svmFns->isValid());
5641
5642                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5643                         {
5644                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5645                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5646                                     u->handle, u->size,
5647                                     0, NULL, NULL);
5648                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5649                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5650                         }
5651                     }
5652                     clFinish(q);
5653                     u->data = (uchar*)u->handle;
5654                     u->markHostCopyObsolete(false);
5655                     u->markDeviceMemMapped(true);
5656                     return;
5657                 }
5658 #endif
5659
5660                 cl_int retval = CL_SUCCESS;
5661                 if (!u->deviceMemMapped())
5662                 {
5663                     CV_Assert(u->refcount == 1);
5664                     CV_Assert(u->mapcount++ == 0);
5665                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5666                                                          (CL_MAP_READ | CL_MAP_WRITE),
5667                                                          0, u->size, 0, 0, 0, &retval);
5668                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5669                 }
5670                 if (u->data && retval == CL_SUCCESS)
5671                 {
5672                     u->markHostCopyObsolete(false);
5673                     u->markDeviceMemMapped(true);
5674                     return;
5675                 }
5676
5677                 // TODO Is it really a good idea and was it tested well?
5678                 // if map failed, switch to copy-on-map mode for the particular buffer
5679                 u->flags |= UMatData::COPY_ON_MAP;
5680             }
5681
5682             if(!u->data)
5683             {
5684                 u->data = (uchar*)fastMalloc(u->size);
5685                 u->markHostCopyObsolete(true);
5686             }
5687         }
5688
5689         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5690         {
5691             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5692 #ifdef HAVE_OPENCL_SVM
5693             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5694 #endif
5695             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5696                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5697             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5698                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5699             u->markHostCopyObsolete(false);
5700         }
5701     }
5702
5703     void unmap(UMatData* u) const CV_OVERRIDE
5704     {
5705         if(!u)
5706             return;
5707
5708
5709         CV_Assert(u->handle != 0);
5710
5711         UMatDataAutoLock autolock(u);
5712
5713         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5714         cl_int retval = 0;
5715         if( !u->copyOnMap() && u->deviceMemMapped() )
5716         {
5717             CV_Assert(u->data != NULL);
5718 #ifdef HAVE_OPENCL_SVM
5719             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5720             {
5721                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5722                 {
5723                     Context& ctx = Context::getDefault();
5724                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5725                     CV_DbgAssert(svmFns->isValid());
5726
5727                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5728                     {
5729                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5730                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5731                                 0, NULL, NULL);
5732                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5733                         clFinish(q);
5734                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5735                     }
5736                 }
5737                 if (u->refcount == 0)
5738                     u->data = 0;
5739                 u->markDeviceCopyObsolete(false);
5740                 u->markHostCopyObsolete(true);
5741                 return;
5742             }
5743 #endif
5744             if (u->refcount == 0)
5745             {
5746                 CV_Assert(u->mapcount-- == 1);
5747                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5748                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5749                 if (Device::getDefault().isAMD())
5750                 {
5751                     // required for multithreaded applications (see stitching test)
5752                     CV_OCL_DBG_CHECK(clFinish(q));
5753                 }
5754                 u->markDeviceMemMapped(false);
5755                 u->data = 0;
5756                 u->markDeviceCopyObsolete(false);
5757                 u->markHostCopyObsolete(true);
5758             }
5759         }
5760         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5761         {
5762             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5763 #ifdef HAVE_OPENCL_SVM
5764             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5765 #endif
5766             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5767                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5768             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5769                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5770             u->markDeviceCopyObsolete(false);
5771             u->markHostCopyObsolete(true);
5772         }
5773     }
5774
5775     bool checkContinuous(int dims, const size_t sz[],
5776                          const size_t srcofs[], const size_t srcstep[],
5777                          const size_t dstofs[], const size_t dststep[],
5778                          size_t& total, size_t new_sz[],
5779                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5780                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5781     {
5782         bool iscontinuous = true;
5783         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5784         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5785         total = sz[dims-1];
5786         for( int i = dims-2; i >= 0; i-- )
5787         {
5788             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5789                 iscontinuous = false;
5790             total *= sz[i];
5791             if( srcofs )
5792                 srcrawofs += srcofs[i]*srcstep[i];
5793             if( dstofs )
5794                 dstrawofs += dstofs[i]*dststep[i];
5795         }
5796
5797         if( !iscontinuous )
5798         {
5799             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5800             if( dims == 2 )
5801             {
5802                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5803                 // we assume that new_... arrays are initialized by caller
5804                 // with 0's, so there is no else branch
5805                 if( srcofs )
5806                 {
5807                     new_srcofs[0] = srcofs[1];
5808                     new_srcofs[1] = srcofs[0];
5809                     new_srcofs[2] = 0;
5810                 }
5811
5812                 if( dstofs )
5813                 {
5814                     new_dstofs[0] = dstofs[1];
5815                     new_dstofs[1] = dstofs[0];
5816                     new_dstofs[2] = 0;
5817                 }
5818
5819                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5820                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5821             }
5822             else
5823             {
5824                 // we could check for dims == 3 here,
5825                 // but from user perspective this one is more informative
5826                 CV_Assert(dims <= 3);
5827                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5828                 if( srcofs )
5829                 {
5830                     new_srcofs[0] = srcofs[2];
5831                     new_srcofs[1] = srcofs[1];
5832                     new_srcofs[2] = srcofs[0];
5833                 }
5834
5835                 if( dstofs )
5836                 {
5837                     new_dstofs[0] = dstofs[2];
5838                     new_dstofs[1] = dstofs[1];
5839                     new_dstofs[2] = dstofs[0];
5840                 }
5841
5842                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5843                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5844             }
5845         }
5846         return iscontinuous;
5847     }
5848
5849     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5850                   const size_t srcofs[], const size_t srcstep[],
5851                   const size_t dststep[]) const CV_OVERRIDE
5852     {
5853         if(!u)
5854             return;
5855         UMatDataAutoLock autolock(u);
5856
5857         if( u->data && !u->hostCopyObsolete() )
5858         {
5859             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5860             return;
5861         }
5862         CV_Assert( u->handle != 0 );
5863
5864         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5865
5866         size_t total = 0, new_sz[] = {0, 0, 0};
5867         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5868         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5869
5870         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5871                                             total, new_sz,
5872                                             srcrawofs, new_srcofs, new_srcstep,
5873                                             dstrawofs, new_dstofs, new_dststep);
5874
5875 #ifdef HAVE_OPENCL_SVM
5876         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5877         {
5878             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5879             Context& ctx = Context::getDefault();
5880             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5881             CV_DbgAssert(svmFns->isValid());
5882
5883             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5884             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5885             {
5886                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5887                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5888                         u->handle, u->size,
5889                         0, NULL, NULL);
5890                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5891             }
5892             clFinish(q);
5893             if( iscontinuous )
5894             {
5895                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5896             }
5897             else
5898             {
5899                 // This code is from MatAllocator::download()
5900                 int isz[CV_MAX_DIM];
5901                 uchar* srcptr = (uchar*)u->handle;
5902                 for( int i = 0; i < dims; i++ )
5903                 {
5904                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5905                     if( sz[i] == 0 )
5906                     return;
5907                     if( srcofs )
5908                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5909                     isz[i] = (int)sz[i];
5910                 }
5911
5912                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5913                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5914
5915                 const Mat* arrays[] = { &src, &dst };
5916                 uchar* ptrs[2];
5917                 NAryMatIterator it(arrays, ptrs, 2);
5918                 size_t j, planesz = it.size;
5919
5920                 for( j = 0; j < it.nplanes; j++, ++it )
5921                     memcpy(ptrs[1], ptrs[0], planesz);
5922             }
5923             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5924             {
5925                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5926                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5927                         0, NULL, NULL);
5928                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5929                 clFinish(q);
5930             }
5931         }
5932         else
5933 #endif
5934         {
5935             if( iscontinuous )
5936             {
5937                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5938                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5939                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5940             }
5941             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5942             {
5943                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5944                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5945                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5946                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5947                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5948                 uchar* ptr = alignedPtr.getAlignedPtr();
5949
5950                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5951                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5952                 total = std::min(total, u->size - new_srcrawofs);
5953                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5954                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5955                 for( size_t i = 0; i < new_sz[1]; i++ )
5956                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5957             }
5958             else
5959             {
5960                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5961                 uchar* ptr = alignedPtr.getAlignedPtr();
5962
5963                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5964                     new_srcofs, new_dstofs, new_sz,
5965                     new_srcstep[0], 0,
5966                     new_dststep[0], 0,
5967                     ptr, 0, 0, 0));
5968             }
5969         }
5970     }
5971
5972     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5973                 const size_t dstofs[], const size_t dststep[],
5974                 const size_t srcstep[]) const CV_OVERRIDE
5975     {
5976         if(!u)
5977             return;
5978
5979         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5980         CV_Assert(u->refcount == 0 || u->tempUMat());
5981
5982         size_t total = 0, new_sz[] = {0, 0, 0};
5983         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5984         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5985
5986         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5987                                             total, new_sz,
5988                                             srcrawofs, new_srcofs, new_srcstep,
5989                                             dstrawofs, new_dstofs, new_dststep);
5990
5991         UMatDataAutoLock autolock(u);
5992
5993         // if there is cached CPU copy of the GPU matrix,
5994         // we could use it as a destination.
5995         // we can do it in 2 cases:
5996         //    1. we overwrite the whole content
5997         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5998         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5999         {
6000             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6001             u->markHostCopyObsolete(false);
6002             u->markDeviceCopyObsolete(true);
6003             return;
6004         }
6005
6006         CV_Assert( u->handle != 0 );
6007         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6008
6009 #ifdef HAVE_OPENCL_SVM
6010         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6011         {
6012             CV_DbgAssert(u->data == NULL || u->data == u->handle);
6013             Context& ctx = Context::getDefault();
6014             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6015             CV_DbgAssert(svmFns->isValid());
6016
6017             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6018             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6019             {
6020                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6021                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6022                         u->handle, u->size,
6023                         0, NULL, NULL);
6024                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6025             }
6026             clFinish(q);
6027             if( iscontinuous )
6028             {
6029                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6030             }
6031             else
6032             {
6033                 // This code is from MatAllocator::upload()
6034                 int isz[CV_MAX_DIM];
6035                 uchar* dstptr = (uchar*)u->handle;
6036                 for( int i = 0; i < dims; i++ )
6037                 {
6038                     CV_Assert( sz[i] <= (size_t)INT_MAX );
6039                     if( sz[i] == 0 )
6040                     return;
6041                     if( dstofs )
6042                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6043                     isz[i] = (int)sz[i];
6044                 }
6045
6046                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6047                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6048
6049                 const Mat* arrays[] = { &src, &dst };
6050                 uchar* ptrs[2];
6051                 NAryMatIterator it(arrays, ptrs, 2);
6052                 size_t j, planesz = it.size;
6053
6054                 for( j = 0; j < it.nplanes; j++, ++it )
6055                     memcpy(ptrs[1], ptrs[0], planesz);
6056             }
6057             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6058             {
6059                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6060                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6061                         0, NULL, NULL);
6062                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6063                 clFinish(q);
6064             }
6065         }
6066         else
6067 #endif
6068         {
6069             if( iscontinuous )
6070             {
6071                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6072                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6073                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6074                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6075                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6076             }
6077             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6078             {
6079                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6080                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6081                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6082                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6083                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6084                 uchar* ptr = alignedPtr.getAlignedPtr();
6085
6086                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6087                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6088                 total = std::min(total, u->size - new_dstrawofs);
6089                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6090                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6091                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6092                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6093                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6094                 for( size_t i = 0; i < new_sz[1]; i++ )
6095                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6096                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6097                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6098             }
6099             else
6100             {
6101                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6102                 uchar* ptr = alignedPtr.getAlignedPtr();
6103
6104                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6105                     new_dstofs, new_srcofs, new_sz,
6106                     new_dststep[0], 0,
6107                     new_srcstep[0], 0,
6108                     ptr, 0, 0, 0));
6109             }
6110         }
6111         u->markHostCopyObsolete(true);
6112 #ifdef HAVE_OPENCL_SVM
6113         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6114                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6115         {
6116             // nothing
6117         }
6118         else
6119 #endif
6120         {
6121             u->markHostCopyObsolete(true);
6122         }
6123         u->markDeviceCopyObsolete(false);
6124     }
6125
6126     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6127               const size_t srcofs[], const size_t srcstep[],
6128               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6129     {
6130         if(!src || !dst)
6131             return;
6132
6133         size_t total = 0, new_sz[] = {0, 0, 0};
6134         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6135         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6136
6137         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6138                                             total, new_sz,
6139                                             srcrawofs, new_srcofs, new_srcstep,
6140                                             dstrawofs, new_dstofs, new_dststep);
6141
6142         UMatDataAutoLock src_autolock(src, dst);
6143
6144         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6145         {
6146             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6147             return;
6148         }
6149         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6150         {
6151             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6152             dst->markHostCopyObsolete(false);
6153 #ifdef HAVE_OPENCL_SVM
6154             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6155                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6156             {
6157                 // nothing
6158             }
6159             else
6160 #endif
6161             {
6162                 dst->markDeviceCopyObsolete(true);
6163             }
6164             return;
6165         }
6166
6167         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6168         CV_Assert(dst->refcount == 0);
6169         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6170
6171         cl_int retval = CL_SUCCESS;
6172 #ifdef HAVE_OPENCL_SVM
6173         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6174                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6175         {
6176             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6177                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6178             {
6179                 Context& ctx = Context::getDefault();
6180                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6181                 CV_DbgAssert(svmFns->isValid());
6182
6183                 if( iscontinuous )
6184                 {
6185                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6186                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6187                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6188                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6189                             total, 0, NULL, NULL);
6190                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6191                 }
6192                 else
6193                 {
6194                     clFinish(q);
6195                     // This code is from MatAllocator::download()/upload()
6196                     int isz[CV_MAX_DIM];
6197                     uchar* srcptr = (uchar*)src->handle;
6198                     for( int i = 0; i < dims; i++ )
6199                     {
6200                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6201                         if( sz[i] == 0 )
6202                         return;
6203                         if( srcofs )
6204                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6205                         isz[i] = (int)sz[i];
6206                     }
6207                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6208
6209                     uchar* dstptr = (uchar*)dst->handle;
6210                     for( int i = 0; i < dims; i++ )
6211                     {
6212                         if( dstofs )
6213                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6214                     }
6215                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6216
6217                     const Mat* arrays[] = { &m_src, &m_dst };
6218                     uchar* ptrs[2];
6219                     NAryMatIterator it(arrays, ptrs, 2);
6220                     size_t j, planesz = it.size;
6221
6222                     for( j = 0; j < it.nplanes; j++, ++it )
6223                         memcpy(ptrs[1], ptrs[0], planesz);
6224                 }
6225             }
6226             else
6227             {
6228                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6229                 {
6230                     map(src, ACCESS_READ);
6231                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6232                     unmap(src);
6233                 }
6234                 else
6235                 {
6236                     map(dst, ACCESS_WRITE);
6237                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6238                     unmap(dst);
6239                 }
6240             }
6241         }
6242         else
6243 #endif
6244         {
6245             if( iscontinuous )
6246             {
6247                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6248                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6249                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6250                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6251             }
6252             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6253             {
6254                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6255                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6256                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6257                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6258                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6259
6260                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6261                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6262                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6263                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6264                 uchar* srcptr = srcBuf.getAlignedPtr();
6265                 uchar* dstptr = dstBuf.getAlignedPtr();
6266
6267                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6268
6269                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6270                 src_total = std::min(src_total, src->size - new_srcrawofs);
6271                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6272                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6273
6274                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6275                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6276                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6277                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6278
6279                 for( size_t i = 0; i < new_sz[1]; i++ )
6280                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6281                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6282                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6283                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6284             }
6285             else
6286             {
6287                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6288                                                    new_srcofs, new_dstofs, new_sz,
6289                                                    new_srcstep[0], 0,
6290                                                    new_dststep[0], 0,
6291                                                    0, 0, 0));
6292             }
6293         }
6294         if (retval == CL_SUCCESS)
6295         {
6296             CV_IMPL_ADD(CV_IMPL_OCL)
6297         }
6298
6299 #ifdef HAVE_OPENCL_SVM
6300         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6301             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6302         {
6303             // nothing
6304         }
6305         else
6306 #endif
6307         {
6308             dst->markHostCopyObsolete(true);
6309         }
6310         dst->markDeviceCopyObsolete(false);
6311
6312         if( _sync )
6313         {
6314             CV_OCL_DBG_CHECK(clFinish(q));
6315         }
6316     }
6317
6318     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6319     {
6320         ocl::Context ctx = Context::getDefault();
6321         if (ctx.empty())
6322             return NULL;
6323 #ifdef HAVE_OPENCL_SVM
6324         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6325         {
6326             return &ctx.getImpl()->getBufferPoolSVM();
6327         }
6328 #endif
6329         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6330         {
6331             return &ctx.getImpl()->getBufferPoolHostPtr();
6332         }
6333         if (id != NULL && strcmp(id, "OCL") != 0)
6334         {
6335             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6336         }
6337         return &ctx.getImpl()->getBufferPool();
6338     }
6339
6340     MatAllocator* matStdAllocator;
6341
6342     mutable cv::Mutex cleanupQueueMutex;
6343     mutable std::deque<UMatData*> cleanupQueue;
6344
6345     void flushCleanupQueue() const
6346     {
6347         if (!cleanupQueue.empty())
6348         {
6349             std::deque<UMatData*> q;
6350             {
6351                 cv::AutoLock lock(cleanupQueueMutex);
6352                 q.swap(cleanupQueue);
6353             }
6354             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6355             {
6356                 deallocate_(*i);
6357             }
6358         }
6359     }
6360     void addToCleanupQueue(UMatData* u) const
6361     {
6362         //TODO: Validation check: CV_Assert(!u->tempUMat());
6363         {
6364             cv::AutoLock lock(cleanupQueueMutex);
6365             cleanupQueue.push_back(u);
6366         }
6367     }
6368 };
6369
6370 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6371 {
6372     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6373     g_isOpenCVActivated = true;
6374     return g_allocator;
6375 }
6376 MatAllocator* getOpenCLAllocator()
6377 {
6378     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6379 }
6380
6381 }} // namespace cv::ocl
6382
6383
6384 namespace cv {
6385
6386 // three funcs below are implemented in umatrix.cpp
6387 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6388               bool autoSteps = false );
6389 void finalizeHdr(UMat& m);
6390
6391 } // namespace cv
6392
6393
6394 namespace cv { namespace ocl {
6395
6396 /*
6397 // Convert OpenCL buffer memory to UMat
6398 */
6399 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6400 {
6401     int d = 2;
6402     int sizes[] = { rows, cols };
6403
6404     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6405
6406     dst.release();
6407
6408     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6409     dst.usageFlags = USAGE_DEFAULT;
6410
6411     setSize(dst, d, sizes, 0, true);
6412     dst.offset = 0;
6413
6414     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6415     cl_mem_object_type mem_type = 0;
6416
6417     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6418
6419     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6420
6421     size_t total = 0;
6422     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6423
6424     CV_OCL_CHECK(clRetainMemObject(memobj));
6425
6426     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6427     CV_Assert(total >= rows * step);
6428
6429     // attach clBuffer to UMatData
6430     dst.u = new UMatData(getOpenCLAllocator());
6431     dst.u->data            = 0;
6432     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6433     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6434     dst.u->handle          = cl_mem_buffer;
6435     dst.u->origdata        = 0;
6436     dst.u->prevAllocator   = 0;
6437     dst.u->size            = total;
6438
6439     finalizeHdr(dst);
6440     dst.addref();
6441
6442     return;
6443 } // convertFromBuffer()
6444
6445
6446 /*
6447 // Convert OpenCL image2d_t memory to UMat
6448 */
6449 void convertFromImage(void* cl_mem_image, UMat& dst)
6450 {
6451     cl_mem             clImage = (cl_mem)cl_mem_image;
6452     cl_mem_object_type mem_type = 0;
6453
6454     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6455
6456     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6457
6458     cl_image_format fmt = { 0, 0 };
6459     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6460
6461     int depth = CV_8U;
6462     switch (fmt.image_channel_data_type)
6463     {
6464     case CL_UNORM_INT8:
6465     case CL_UNSIGNED_INT8:
6466         depth = CV_8U;
6467         break;
6468
6469     case CL_SNORM_INT8:
6470     case CL_SIGNED_INT8:
6471         depth = CV_8S;
6472         break;
6473
6474     case CL_UNORM_INT16:
6475     case CL_UNSIGNED_INT16:
6476         depth = CV_16U;
6477         break;
6478
6479     case CL_SNORM_INT16:
6480     case CL_SIGNED_INT16:
6481         depth = CV_16S;
6482         break;
6483
6484     case CL_SIGNED_INT32:
6485         depth = CV_32S;
6486         break;
6487
6488     case CL_FLOAT:
6489         depth = CV_32F;
6490         break;
6491
6492     default:
6493         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6494     }
6495
6496     int type = CV_8UC1;
6497     switch (fmt.image_channel_order)
6498     {
6499     case CL_R:
6500         type = CV_MAKE_TYPE(depth, 1);
6501         break;
6502
6503     case CL_RGBA:
6504     case CL_BGRA:
6505     case CL_ARGB:
6506         type = CV_MAKE_TYPE(depth, 4);
6507         break;
6508
6509     default:
6510         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6511         break;
6512     }
6513
6514     size_t step = 0;
6515     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6516
6517     size_t w = 0;
6518     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6519
6520     size_t h = 0;
6521     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6522
6523     dst.create((int)h, (int)w, type);
6524
6525     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6526
6527     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6528
6529     size_t offset = 0;
6530     size_t src_origin[3] = { 0, 0, 0 };
6531     size_t region[3] = { w, h, 1 };
6532     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6533
6534     CV_OCL_CHECK(clFinish(q));
6535
6536     return;
6537 } // convertFromImage()
6538
6539
6540 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6541
6542 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6543 {
6544     cl_uint numDevices = 0;
6545     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6546     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6547     {
6548         CV_OCL_DBG_CHECK_RESULT(status,
6549             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6550     }
6551
6552     if (numDevices == 0)
6553     {
6554         devices.clear();
6555         return;
6556     }
6557
6558     devices.resize((size_t)numDevices);
6559     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6560 }
6561
6562 struct PlatformInfo::Impl
6563 {
6564     Impl(void* id)
6565     {
6566         refcount = 1;
6567         handle = *(cl_platform_id*)id;
6568         getDevices(devices, handle);
6569     }
6570
6571     String getStrProp(cl_platform_info prop) const
6572     {
6573         char buf[1024];
6574         size_t sz=0;
6575         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6576             sz < sizeof(buf) ? String(buf) : String();
6577     }
6578
6579     IMPLEMENT_REFCOUNTABLE();
6580     std::vector<cl_device_id> devices;
6581     cl_platform_id handle;
6582 };
6583
6584 PlatformInfo::PlatformInfo()
6585 {
6586     p = 0;
6587 }
6588
6589 PlatformInfo::PlatformInfo(void* platform_id)
6590 {
6591     p = new Impl(platform_id);
6592 }
6593
6594 PlatformInfo::~PlatformInfo()
6595 {
6596     if(p)
6597         p->release();
6598 }
6599
6600 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6601 {
6602     if (i.p)
6603         i.p->addref();
6604     p = i.p;
6605 }
6606
6607 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6608 {
6609     if (i.p != p)
6610     {
6611         if (i.p)
6612             i.p->addref();
6613         if (p)
6614             p->release();
6615         p = i.p;
6616     }
6617     return *this;
6618 }
6619
6620 int PlatformInfo::deviceNumber() const
6621 {
6622     return p ? (int)p->devices.size() : 0;
6623 }
6624
6625 void PlatformInfo::getDevice(Device& device, int d) const
6626 {
6627     CV_Assert(p && d < (int)p->devices.size() );
6628     if(p)
6629         device.set(p->devices[d]);
6630 }
6631
6632 String PlatformInfo::name() const
6633 {
6634     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6635 }
6636
6637 String PlatformInfo::vendor() const
6638 {
6639     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6640 }
6641
6642 String PlatformInfo::version() const
6643 {
6644     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6645 }
6646
6647 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6648 {
6649     cl_uint numPlatforms = 0;
6650     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6651
6652     if (numPlatforms == 0)
6653     {
6654         platforms.clear();
6655         return;
6656     }
6657
6658     platforms.resize((size_t)numPlatforms);
6659     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6660 }
6661
6662 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6663 {
6664     std::vector<cl_platform_id> platforms;
6665     getPlatforms(platforms);
6666
6667     for (size_t i = 0; i < platforms.size(); i++)
6668         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6669 }
6670
6671 const char* typeToStr(int type)
6672 {
6673     static const char* tab[]=
6674     {
6675         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6676         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6677         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6678         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6679         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6680         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6681         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6682         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6683         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6684     };
6685     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6686     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6687     CV_Assert(result);
6688     return result;
6689 }
6690
6691 const char* memopTypeToStr(int type)
6692 {
6693     static const char* tab[] =
6694     {
6695         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6696         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6697         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6698         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6699         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6700         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6701         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6702         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6703         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6704     };
6705     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6706     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6707     CV_Assert(result);
6708     return result;
6709 }
6710
6711 const char* vecopTypeToStr(int type)
6712 {
6713     static const char* tab[] =
6714     {
6715         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6716         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6717         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6718         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6719         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6720         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6721         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6722         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6723         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6724     };
6725     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6726     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6727     CV_Assert(result);
6728     return result;
6729 }
6730
6731 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6732 {
6733     if( sdepth == ddepth )
6734         return "noconvert";
6735     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6736     if( ddepth >= CV_32F ||
6737         (ddepth == CV_32S && sdepth < CV_32S) ||
6738         (ddepth == CV_16S && sdepth <= CV_8S) ||
6739         (ddepth == CV_16U && sdepth == CV_8U))
6740     {
6741         sprintf(buf, "convert_%s", typestr);
6742     }
6743     else if( sdepth >= CV_32F )
6744         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6745     else
6746         sprintf(buf, "convert_%s_sat", typestr);
6747
6748     return buf;
6749 }
6750
6751 const char* getOpenCLErrorString(int errorCode)
6752 {
6753 #define CV_OCL_CODE(id) case id: return #id
6754 #define CV_OCL_CODE_(id, name) case id: return #name
6755     switch (errorCode)
6756     {
6757     CV_OCL_CODE(CL_SUCCESS);
6758     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6759     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6760     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6761     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6762     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6763     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6764     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6765     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6766     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6767     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6768     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6769     CV_OCL_CODE(CL_MAP_FAILURE);
6770     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6771     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6772     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6773     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6774     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6775     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6776     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6777     CV_OCL_CODE(CL_INVALID_VALUE);
6778     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6779     CV_OCL_CODE(CL_INVALID_PLATFORM);
6780     CV_OCL_CODE(CL_INVALID_DEVICE);
6781     CV_OCL_CODE(CL_INVALID_CONTEXT);
6782     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6783     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6784     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6785     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6786     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6787     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6788     CV_OCL_CODE(CL_INVALID_SAMPLER);
6789     CV_OCL_CODE(CL_INVALID_BINARY);
6790     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6791     CV_OCL_CODE(CL_INVALID_PROGRAM);
6792     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6793     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6794     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6795     CV_OCL_CODE(CL_INVALID_KERNEL);
6796     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6797     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6798     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6799     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6800     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6801     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6802     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6803     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6804     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6805     CV_OCL_CODE(CL_INVALID_EVENT);
6806     CV_OCL_CODE(CL_INVALID_OPERATION);
6807     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6808     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6809     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6810     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6811     // OpenCL 1.1
6812     CV_OCL_CODE(CL_INVALID_PROPERTY);
6813     // OpenCL 1.2
6814     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6815     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6816     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6817     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6818     // OpenCL 2.0
6819     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6820     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6821     // Extensions
6822     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6823     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6824     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6825     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6826     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6827     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6828     default: return "Unknown OpenCL error";
6829     }
6830 #undef CV_OCL_CODE
6831 #undef CV_OCL_CODE_
6832 }
6833
6834 template <typename T>
6835 static std::string kerToStr(const Mat & k)
6836 {
6837     int width = k.cols - 1, depth = k.depth();
6838     const T * const data = k.ptr<T>();
6839
6840     std::ostringstream stream;
6841     stream.precision(10);
6842
6843     if (depth <= CV_8S)
6844     {
6845         for (int i = 0; i < width; ++i)
6846             stream << "DIG(" << (int)data[i] << ")";
6847         stream << "DIG(" << (int)data[width] << ")";
6848     }
6849     else if (depth == CV_32F)
6850     {
6851         stream.setf(std::ios_base::showpoint);
6852         for (int i = 0; i < width; ++i)
6853             stream << "DIG(" << data[i] << "f)";
6854         stream << "DIG(" << data[width] << "f)";
6855     }
6856     else
6857     {
6858         for (int i = 0; i < width; ++i)
6859             stream << "DIG(" << data[i] << ")";
6860         stream << "DIG(" << data[width] << ")";
6861     }
6862
6863     return stream.str();
6864 }
6865
6866 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6867 {
6868     Mat kernel = _kernel.getMat().reshape(1, 1);
6869
6870     int depth = kernel.depth();
6871     if (ddepth < 0)
6872         ddepth = depth;
6873
6874     if (ddepth != depth)
6875         kernel.convertTo(kernel, ddepth);
6876
6877     typedef std::string (* func_t)(const Mat &);
6878     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6879                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6880     const func_t func = funcs[ddepth];
6881     CV_Assert(func != 0);
6882
6883     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6884 }
6885
6886 #define PROCESS_SRC(src) \
6887     do \
6888     { \
6889         if (!src.empty()) \
6890         { \
6891             CV_Assert(src.isMat() || src.isUMat()); \
6892             Size csize = src.size(); \
6893             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6894                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6895             if (cwidth < ckercn || ckercn <= 0) \
6896                 return 1; \
6897             cols.push_back(cwidth); \
6898             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6899                 return 1; \
6900             offsets.push_back(src.offset()); \
6901             steps.push_back(src.step()); \
6902             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6903             kercns.push_back(ckercn); \
6904         } \
6905     } \
6906     while ((void)0, 0)
6907
6908 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6909                               InputArray src4, InputArray src5, InputArray src6,
6910                               InputArray src7, InputArray src8, InputArray src9,
6911                               OclVectorStrategy strat)
6912 {
6913     const ocl::Device & d = ocl::Device::getDefault();
6914
6915     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6916         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6917         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6918         d.preferredVectorWidthDouble(), -1 };
6919
6920     // if the device says don't use vectors
6921     if (vectorWidths[0] == 1)
6922     {
6923         // it's heuristic
6924         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6925         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6926         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6927     }
6928
6929     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6930 }
6931
6932 int checkOptimalVectorWidth(const int *vectorWidths,
6933                             InputArray src1, InputArray src2, InputArray src3,
6934                             InputArray src4, InputArray src5, InputArray src6,
6935                             InputArray src7, InputArray src8, InputArray src9,
6936                             OclVectorStrategy strat)
6937 {
6938     CV_Assert(vectorWidths);
6939
6940     int ref_type = src1.type();
6941
6942     std::vector<size_t> offsets, steps, cols;
6943     std::vector<int> dividers, kercns;
6944     PROCESS_SRC(src1);
6945     PROCESS_SRC(src2);
6946     PROCESS_SRC(src3);
6947     PROCESS_SRC(src4);
6948     PROCESS_SRC(src5);
6949     PROCESS_SRC(src6);
6950     PROCESS_SRC(src7);
6951     PROCESS_SRC(src8);
6952     PROCESS_SRC(src9);
6953
6954     size_t size = offsets.size();
6955
6956     for (size_t i = 0; i < size; ++i)
6957         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6958             dividers[i] >>= 1, kercns[i] >>= 1;
6959
6960     // default strategy
6961     int kercn = *std::min_element(kercns.begin(), kercns.end());
6962
6963     return kercn;
6964 }
6965
6966 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6967                                  InputArray src4, InputArray src5, InputArray src6,
6968                                  InputArray src7, InputArray src8, InputArray src9)
6969 {
6970     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6971 }
6972
6973 #undef PROCESS_SRC
6974
6975
6976 // TODO Make this as a method of OpenCL "BuildOptions" class
6977 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6978 {
6979     if (!buildOptions.empty())
6980         buildOptions += " ";
6981     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6982     buildOptions += format(
6983             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6984             name.c_str(), ocl::typeToStr(type),
6985             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6986             name.c_str(), (int)CV_MAT_CN(type),
6987             name.c_str(), (int)CV_ELEM_SIZE(type),
6988             name.c_str(), (int)CV_ELEM_SIZE1(type),
6989             name.c_str(), (int)depth
6990             );
6991 }
6992
6993
6994 struct Image2D::Impl
6995 {
6996     Impl(const UMat &src, bool norm, bool alias)
6997     {
6998         handle = 0;
6999         refcount = 1;
7000         init(src, norm, alias);
7001     }
7002
7003     ~Impl()
7004     {
7005         if (handle)
7006             clReleaseMemObject(handle);
7007     }
7008
7009     static cl_image_format getImageFormat(int depth, int cn, bool norm)
7010     {
7011         cl_image_format format;
7012         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7013                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7014         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7015                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
7016         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7017
7018         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7019         int channelOrder = channelOrders[cn];
7020         format.image_channel_data_type = (cl_channel_type)channelType;
7021         format.image_channel_order = (cl_channel_order)channelOrder;
7022         return format;
7023     }
7024
7025     static bool isFormatSupported(cl_image_format format)
7026     {
7027         if (!haveOpenCL())
7028             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7029
7030         cl_context context = (cl_context)Context::getDefault().ptr();
7031         if (!context)
7032             return false;
7033
7034         // Figure out how many formats are supported by this context.
7035         cl_uint numFormats = 0;
7036         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7037                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
7038                                                 NULL, &numFormats);
7039         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7040         if (numFormats > 0)
7041         {
7042             AutoBuffer<cl_image_format> formats(numFormats);
7043             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7044                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
7045                                              formats.data(), NULL);
7046             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7047             for (cl_uint i = 0; i < numFormats; ++i)
7048             {
7049                 if (!memcmp(&formats[i], &format, sizeof(format)))
7050                 {
7051                     return true;
7052                 }
7053             }
7054         }
7055         return false;
7056     }
7057
7058     void init(const UMat &src, bool norm, bool alias)
7059     {
7060         if (!haveOpenCL())
7061             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7062
7063         CV_Assert(!src.empty());
7064         CV_Assert(ocl::Device::getDefault().imageSupport());
7065
7066         int err, depth = src.depth(), cn = src.channels();
7067         CV_Assert(cn <= 4);
7068         cl_image_format format = getImageFormat(depth, cn, norm);
7069
7070         if (!isFormatSupported(format))
7071             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7072
7073         if (alias && !src.handle(ACCESS_RW))
7074             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7075
7076         cl_context context = (cl_context)Context::getDefault().ptr();
7077         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7078
7079 #ifdef CL_VERSION_1_2
7080         // this enables backwards portability to
7081         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7082         const Device & d = ocl::Device::getDefault();
7083         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7084         CV_Assert(!alias || canCreateAlias(src));
7085         if (1 < major || (1 == major && 2 <= minor))
7086         {
7087             cl_image_desc desc;
7088             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7089             desc.image_width      = src.cols;
7090             desc.image_height     = src.rows;
7091             desc.image_depth      = 0;
7092             desc.image_array_size = 1;
7093             desc.image_row_pitch  = alias ? src.step[0] : 0;
7094             desc.image_slice_pitch = 0;
7095             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7096             desc.num_mip_levels   = 0;
7097             desc.num_samples      = 0;
7098             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7099         }
7100         else
7101 #endif
7102         {
7103             CV_SUPPRESS_DEPRECATED_START
7104             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7105             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7106             CV_SUPPRESS_DEPRECATED_END
7107         }
7108         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7109
7110         size_t origin[] = { 0, 0, 0 };
7111         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7112
7113         cl_mem devData;
7114         if (!alias && !src.isContinuous())
7115         {
7116             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7117             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7118                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7119                 ).c_str());
7120
7121             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7122             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7123                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7124             CV_OCL_DBG_CHECK(clFlush(queue));
7125         }
7126         else
7127         {
7128             devData = (cl_mem)src.handle(ACCESS_READ);
7129         }
7130         CV_Assert(devData != NULL);
7131
7132         if (!alias)
7133         {
7134             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7135             if (!src.isContinuous())
7136             {
7137                 CV_OCL_DBG_CHECK(clFlush(queue));
7138                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7139             }
7140         }
7141     }
7142
7143     IMPLEMENT_REFCOUNTABLE();
7144
7145     cl_mem handle;
7146 };
7147
7148 Image2D::Image2D()
7149 {
7150     p = NULL;
7151 }
7152
7153 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7154 {
7155     p = new Impl(src, norm, alias);
7156 }
7157
7158 bool Image2D::canCreateAlias(const UMat &m)
7159 {
7160     bool ret = false;
7161     const Device & d = ocl::Device::getDefault();
7162     if (d.imageFromBufferSupport() && !m.empty())
7163     {
7164         // This is the required pitch alignment in pixels
7165         uint pitchAlign = d.imagePitchAlignment();
7166         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7167         {
7168             // We don't currently handle the case where the buffer was created
7169             // with CL_MEM_USE_HOST_PTR
7170             if (!m.u->tempUMat())
7171             {
7172                 ret = true;
7173             }
7174         }
7175     }
7176     return ret;
7177 }
7178
7179 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7180 {
7181     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7182
7183     return Impl::isFormatSupported(format);
7184 }
7185
7186 Image2D::Image2D(const Image2D & i)
7187 {
7188     p = i.p;
7189     if (p)
7190         p->addref();
7191 }
7192
7193 Image2D & Image2D::operator = (const Image2D & i)
7194 {
7195     if (i.p != p)
7196     {
7197         if (i.p)
7198             i.p->addref();
7199         if (p)
7200             p->release();
7201         p = i.p;
7202     }
7203     return *this;
7204 }
7205
7206 Image2D::~Image2D()
7207 {
7208     if (p)
7209         p->release();
7210 }
7211
7212 void* Image2D::ptr() const
7213 {
7214     return p ? p->handle : 0;
7215 }
7216
7217 bool internal::isOpenCLForced()
7218 {
7219     static bool initialized = false;
7220     static bool value = false;
7221     if (!initialized)
7222     {
7223         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7224         initialized = true;
7225     }
7226     return value;
7227 }
7228
7229 bool internal::isPerformanceCheckBypassed()
7230 {
7231     static bool initialized = false;
7232     static bool value = false;
7233     if (!initialized)
7234     {
7235         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7236         initialized = true;
7237     }
7238     return value;
7239 }
7240
7241 bool internal::isCLBuffer(UMat& u)
7242 {
7243     void* h = u.handle(ACCESS_RW);
7244     if (!h)
7245         return true;
7246     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7247 #if 1
7248     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7249         return false;
7250 #else
7251     cl_mem_object_type type = 0;
7252     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7253     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7254         return false;
7255 #endif
7256     return true;
7257 }
7258
7259 struct Timer::Impl
7260 {
7261     const Queue queue;
7262
7263     Impl(const Queue& q)
7264         : queue(q)
7265     {
7266     }
7267
7268     ~Impl(){}
7269
7270     void start()
7271     {
7272         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7273         timer.start();
7274     }
7275
7276     void stop()
7277     {
7278         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7279         timer.stop();
7280     }
7281
7282     uint64 durationNS() const
7283     {
7284         return (uint64)(timer.getTimeSec() * 1e9);
7285     }
7286
7287     TickMeter timer;
7288 };
7289
7290 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7291 Timer::~Timer() { delete p; }
7292
7293 void Timer::start()
7294 {
7295     CV_Assert(p);
7296     p->start();
7297 }
7298
7299 void Timer::stop()
7300 {
7301     CV_Assert(p);
7302     p->stop();
7303 }
7304
7305 uint64 Timer::durationNS() const
7306 {
7307     CV_Assert(p);
7308     return p->durationNS();
7309 }
7310
7311 }} // namespace
7312
7313 #ifdef HAVE_DIRECTX
7314 namespace cv { namespace directx { namespace internal {
7315 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7316 {
7317     ocl::Context::Impl* i = ctx.getImpl();
7318     CV_Assert(i);
7319     return i->getDirectXImpl();
7320 }
7321 }}} // namespace cv::directx::internal
7322 #endif
7323
7324 #endif // HAVE_OPENCL