Merge pull request #19194 from alalek:intelligent_scissors
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_DIRECTX
117 #include "directx.hpp"
118 #endif
119
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
124 #endif
125
126 #include "umatrix.hpp"
127
128 namespace cv { namespace ocl {
129
130 #define IMPLEMENT_REFCOUNTABLE() \
131     void addref() { CV_XADD(&refcount, 1); } \
132     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
133     int refcount
134
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
136
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
139 {
140     return opencl_allocator_stats;
141 }
142
143 #ifndef _DEBUG
144 static bool isRaiseError()
145 {
146     static bool initialized = false;
147     static bool value = false;
148     if (!initialized)
149     {
150         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
151         initialized = true;
152     }
153     return value;
154 }
155 #endif
156
157 #if CV_OPENCL_TRACE_CHECK
158 static inline
159 void traceOpenCLCheck(cl_int status, const char* message)
160 {
161     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
162 }
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
164 #else
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
166 #endif
167
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
170
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
172     do { \
173         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174         if (check_result != CL_SUCCESS) \
175         { \
176             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178             CV_Error(Error::OpenCLApiCallError, error_msg); \
179         } \
180     } while (0)
181
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
183
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
185
186 #ifdef _DEBUG
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
190 #else
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
192     do { \
193         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194         if (check_result != CL_SUCCESS && isRaiseError()) \
195         { \
196             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198             CV_Error(Error::OpenCLApiCallError, error_msg); \
199         } \
200     } while (0)
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
203 #endif
204
205
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
210
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
213 #endif
214
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
217 #ifdef __APPLE__
218         true
219 #else
220         false
221 #endif
222 );
223
224 static const String getBuildExtraOptions()
225 {
226     static String param_buildExtraOptions;
227     static bool initialized = false;
228     if (!initialized)
229     {
230         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
231         initialized = true;
232         if (!param_buildExtraOptions.empty())
233             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
234     }
235     return param_buildExtraOptions;
236 }
237
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
240
241
242 struct UMat2D
243 {
244     UMat2D(const UMat& m)
245     {
246         offset = (int)m.offset;
247         step = (int)m.step;
248         rows = m.rows;
249         cols = m.cols;
250     }
251     int offset;
252     int step;
253     int rows;
254     int cols;
255 };
256
257 struct UMat3D
258 {
259     UMat3D(const UMat& m)
260     {
261         offset = (int)m.offset;
262         step = (int)m.step.p[1];
263         slicestep = (int)m.step.p[0];
264         slices = (int)m.size.p[0];
265         rows = m.size.p[1];
266         cols = m.size.p[2];
267     }
268     int offset;
269     int slicestep;
270     int step;
271     int slices;
272     int rows;
273     int cols;
274 };
275
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
278 {
279     static uint64 table[256];
280     static bool initialized = false;
281
282     if( !initialized )
283     {
284         for( int i = 0; i < 256; i++ )
285         {
286             uint64 c = i;
287             for( int j = 0; j < 8; j++ )
288                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
289             table[i] = c;
290         }
291         initialized = true;
292     }
293
294     uint64 crc = ~crc0;
295     for( size_t idx = 0; idx < size; idx++ )
296         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
297
298     return ~crc;
299 }
300
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
303 {
304     cv::String cache_path_;
305     cv::String cache_lock_filename_;
306     cv::Ptr<utils::fs::FileLock> cache_lock_;
307
308     typedef std::map<std::string, std::string> ContextCacheType;
309     ContextCacheType prepared_contexts_;
310     Mutex mutex_prepared_contexts_;
311
312     OpenCLBinaryCacheConfigurator()
313     {
314         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315         if (!CV_OPENCL_CACHE_ENABLE)
316         {
317             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
318             return;
319         }
320         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321         if (cache_path_.empty())
322         {
323             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
324         }
325         do
326         {
327             try
328             {
329                 if (cache_path_.empty())
330                     break;
331                 if (cache_path_ == "disabled")
332                     break;
333                 if (!utils::fs::createDirectories(cache_path_))
334                 {
335                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
336                     clear();
337                     break;
338                 }
339
340                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
341                 {
342                     cache_lock_filename_ = cache_path_ + ".lock";
343                     if (!utils::fs::exists(cache_lock_filename_))
344                     {
345                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347                         if (!lock_filename.is_open())
348                         {
349                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
350                             break;
351                         }
352                     }
353
354                     try
355                     {
356                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
358                         {
359                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
360                         }
361                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
362                     }
363                     catch (const cv::Exception& e)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
366                     }
367                     catch (...)
368                     {
369                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
370                     }
371                 }
372                 else
373                 {
374                     if (CV_OPENCL_CACHE_WRITE)
375                     {
376                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377                                 "(not safe for multiprocess environment)");
378                     }
379                     else
380                     {
381                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
382                     }
383                 }
384             }
385             catch (const cv::Exception& e)
386             {
387                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
388                 clear();
389             }
390         } while (0);
391         if (!cache_path_.empty())
392         {
393             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
394             {
395                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
397             }
398             else
399             {
400                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
401             }
402         }
403     }
404
405     void clear()
406     {
407         cache_path_.clear();
408         cache_lock_filename_.clear();
409         cache_lock_.release();
410     }
411
412     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413             const std::string& cleanup_prefix)
414     {
415         if (cache_path_.empty())
416             return std::string();
417
418         AutoLock lock(mutex_prepared_contexts_);
419
420         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421         if (found_it != prepared_contexts_.end())
422             return found_it->second;
423
424         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
425
426         std::string target_directory = cache_path_ + ctx_prefix + "/";
427         bool result = utils::fs::isDirectory(target_directory);
428         if (!result)
429         {
430             try
431             {
432                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433                 if (utils::fs::createDirectories(target_directory))
434                 {
435                     result = true;
436                 }
437                 else
438                 {
439                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
440                 }
441             }
442             catch (const cv::Exception& e)
443             {
444                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
445             }
446         }
447         target_directory = result ? target_directory : std::string();
448         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
449
450         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
451         {
452             try
453             {
454                 std::vector<String> entries;
455                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456                 std::vector<String> remove_entries;
457                 for (size_t i = 0; i < entries.size(); i++)
458                 {
459                     const String& name = entries[i];
460                     if (0 == name.find(cleanup_prefix))
461                     {
462                         if (0 == name.find(ctx_prefix))
463                             continue; // skip current
464                         remove_entries.push_back(name);
465                     }
466                 }
467                 if (!remove_entries.empty())
468                 {
469                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470                             ? "Detected OpenCL cache directory for other version of OpenCL device."
471                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
472                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474                     for (size_t i = 0; i < remove_entries.size(); i++)
475                     {
476                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
477                     }
478                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
479
480                     for (size_t i = 0; i < remove_entries.size(); i++)
481                     {
482                         const String& name = remove_entries[i];
483                         cv::String path = utils::fs::join(cache_path_, name);
484                         try
485                         {
486                             utils::fs::remove_all(path);
487                             CV_LOG_WARNING(NULL, "Removed: " << path);
488                         }
489                         catch (const cv::Exception& e)
490                         {
491                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
492                         }
493                     }
494                 }
495             }
496             catch (...)
497             {
498                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
499             }
500         }
501
502         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503         return target_directory;
504     }
505
506     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
507     {
508         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
509     }
510 };
511 class BinaryProgramFile
512 {
513     enum { MAX_ENTRIES = 64 };
514
515     typedef unsigned int uint32_t;
516
517     struct CV_DECL_ALIGNED(4) FileHeader
518     {
519         uint32_t sourceSignatureSize;
520         //char sourceSignature[];
521     };
522
523     struct CV_DECL_ALIGNED(4) FileTable
524     {
525         uint32_t numberOfEntries;
526         //uint32_t firstEntryOffset[];
527     };
528
529     struct CV_DECL_ALIGNED(4) FileEntry
530     {
531         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
532         uint32_t keySize;
533         uint32_t dataSize;
534         //char key[];
535         //char data[];
536     };
537
538     const std::string fileName_;
539     const char* const sourceSignature_;
540     const size_t sourceSignatureSize_;
541
542     std::fstream f;
543
544     uint32_t entryOffsets[MAX_ENTRIES];
545
546     uint32_t getHash(const std::string& options)
547     {
548         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549         return hash & (MAX_ENTRIES - 1);
550     }
551
552     inline size_t getFileSize()
553     {
554         size_t pos = (size_t)f.tellg();
555         f.seekg(0, std::fstream::end);
556         size_t fileSize = (size_t)f.tellg();
557         f.seekg(pos, std::fstream::beg);
558         return fileSize;
559     }
560     inline uint32_t readUInt32()
561     {
562         uint32_t res = 0;
563         f.read((char*)&res, sizeof(uint32_t));
564         CV_Assert(!f.fail());
565         return res;
566     }
567     inline void writeUInt32(const uint32_t value)
568     {
569         uint32_t v = value;
570         f.write((char*)&v, sizeof(uint32_t));
571         CV_Assert(!f.fail());
572     }
573
574     inline void seekReadAbsolute(size_t pos)
575     {
576         f.seekg(pos, std::fstream::beg);
577         CV_Assert(!f.fail());
578     }
579     inline void seekReadRelative(size_t pos)
580     {
581         f.seekg(pos, std::fstream::cur);
582         CV_Assert(!f.fail());
583     }
584
585     inline void seekWriteAbsolute(size_t pos)
586     {
587         f.seekp(pos, std::fstream::beg);
588         CV_Assert(!f.fail());
589     }
590
591     void clearFile()
592     {
593         f.close();
594         if (0 != remove(fileName_.c_str()))
595             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
596         return;
597     }
598
599 public:
600     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
602     {
603         CV_StaticAssert(sizeof(uint32_t) == 4, "");
604         CV_Assert(sourceSignature_ != NULL);
605         CV_Assert(sourceSignatureSize_ > 0);
606         memset(entryOffsets, 0, sizeof(entryOffsets));
607
608         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610         if(f.is_open() && getFileSize() > 0)
611         {
612             bool isValid = false;
613             try
614             {
615                 uint32_t fileSourceSignatureSize = readUInt32();
616                 if (fileSourceSignatureSize == sourceSignatureSize_)
617                 {
618                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
620                     if (f.eof())
621                     {
622                         CV_LOG_ERROR(NULL, "Unexpected EOF");
623                     }
624                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
625                     {
626                         isValid = true;
627                     }
628                 }
629                 if (!isValid)
630                 {
631                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
632                 }
633             }
634             catch (const cv::Exception& e)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
637             }
638             catch (...)
639             {
640                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
641             }
642             if (!isValid)
643             {
644                 clearFile();
645             }
646             else
647             {
648                 seekReadAbsolute(0);
649             }
650         }
651     }
652
653     bool read(const std::string& key, std::vector<char>& buf)
654     {
655         if (!f.is_open())
656             return false;
657
658         size_t fileSize = getFileSize();
659         if (fileSize == 0)
660         {
661             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
662             clearFile();
663             return false;
664         }
665         seekReadAbsolute(0);
666
667         // bypass FileHeader
668         uint32_t fileSourceSignatureSize = readUInt32();
669         CV_Assert(fileSourceSignatureSize > 0);
670         seekReadRelative(fileSourceSignatureSize);
671
672         uint32_t numberOfEntries = readUInt32();
673         CV_Assert(numberOfEntries > 0);
674         if (numberOfEntries != MAX_ENTRIES)
675         {
676             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
677             clearFile();
678             return false;
679         }
680         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681         CV_Assert(!f.fail());
682
683         uint32_t entryNum = getHash(key);
684
685         uint32_t entryOffset = entryOffsets[entryNum];
686         FileEntry entry;
687         while (entryOffset > 0)
688         {
689             seekReadAbsolute(entryOffset);
690             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691             f.read((char*)&entry, sizeof(entry));
692             CV_Assert(!f.fail());
693             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694             if (key.size() == entry.keySize)
695             {
696                 if (entry.keySize > 0)
697                 {
698                     f.read(fileKey.data(), entry.keySize);
699                     CV_Assert(!f.fail());
700                 }
701                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
702                 {
703                     buf.resize(entry.dataSize);
704                     f.read(&buf[0], entry.dataSize);
705                     CV_Assert(!f.fail());
706                     seekReadAbsolute(0);
707                     CV_LOG_VERBOSE(NULL, 0, "Read...");
708                     return true;
709                 }
710             }
711             if (entry.nextEntryFileOffset == 0)
712                 break;
713             entryOffset = entry.nextEntryFileOffset;
714         }
715         return false;
716     }
717
718     bool write(const std::string& key, std::vector<char>& buf)
719     {
720         if (!f.is_open())
721         {
722             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
723             if (!f.is_open())
724             {
725                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
726                 if (!f.is_open())
727                 {
728                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
729                     return false;
730                 }
731             }
732         }
733
734         size_t fileSize = getFileSize();
735         if (fileSize == 0)
736         {
737             // Write header
738             seekWriteAbsolute(0);
739             writeUInt32((uint32_t)sourceSignatureSize_);
740             f.write(sourceSignature_, sourceSignatureSize_);
741             CV_Assert(!f.fail());
742
743             writeUInt32(MAX_ENTRIES);
744             memset(entryOffsets, 0, sizeof(entryOffsets));
745             f.write((char*)entryOffsets, sizeof(entryOffsets));
746             CV_Assert(!f.fail());
747             f.flush();
748             CV_Assert(!f.fail());
749             f.close();
750             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751             CV_Assert(f.is_open());
752             fileSize = getFileSize();
753         }
754         seekReadAbsolute(0);
755
756         // bypass FileHeader
757         uint32_t fileSourceSignatureSize = readUInt32();
758         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759         seekReadRelative(fileSourceSignatureSize);
760
761         uint32_t numberOfEntries = readUInt32();
762         CV_Assert(numberOfEntries > 0);
763         if (numberOfEntries != MAX_ENTRIES)
764         {
765             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
766             clearFile();
767             return false;
768         }
769         size_t tableEntriesOffset = (size_t)f.tellg();
770         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771         CV_Assert(!f.fail());
772
773         uint32_t entryNum = getHash(key);
774
775         uint32_t entryOffset = entryOffsets[entryNum];
776         FileEntry entry;
777         while (entryOffset > 0)
778         {
779             seekReadAbsolute(entryOffset);
780             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781             f.read((char*)&entry, sizeof(entry));
782             CV_Assert(!f.fail());
783             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784             if (key.size() == entry.keySize)
785             {
786                 if (entry.keySize > 0)
787                 {
788                     f.read(fileKey.data(), entry.keySize);
789                     CV_Assert(!f.fail());
790                 }
791                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
792                 {
793                     // duplicate
794                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
795                     return false;
796                 }
797             }
798             if (entry.nextEntryFileOffset == 0)
799                 break;
800             entryOffset = entry.nextEntryFileOffset;
801         }
802         seekReadAbsolute(0);
803         if (entryOffset > 0)
804         {
805             seekWriteAbsolute(entryOffset);
806             entry.nextEntryFileOffset = (uint32_t)fileSize;
807             f.write((char*)&entry, sizeof(entry));
808             CV_Assert(!f.fail());
809         }
810         else
811         {
812             entryOffsets[entryNum] = (uint32_t)fileSize;
813             seekWriteAbsolute(tableEntriesOffset);
814             f.write((char*)entryOffsets, sizeof(entryOffsets));
815             CV_Assert(!f.fail());
816         }
817         seekWriteAbsolute(fileSize);
818         entry.nextEntryFileOffset = 0;
819         entry.dataSize = (uint32_t)buf.size();
820         entry.keySize = (uint32_t)key.size();
821         f.write((char*)&entry, sizeof(entry));
822         CV_Assert(!f.fail());
823         f.write(key.c_str(), entry.keySize);
824         CV_Assert(!f.fail());
825         f.write(&buf[0], entry.dataSize);
826         CV_Assert(!f.fail());
827         f.flush();
828         CV_Assert(!f.fail());
829         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
830         return true;
831     }
832 };
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
834
835
836
837 struct OpenCLExecutionContext::Impl
838 {
839     ocl::Context context_;
840     int device_;  // device index in context
841     ocl::Queue queue_;
842     int useOpenCL_;
843
844 protected:
845     Impl() = delete;
846
847     void _init_device(cl_device_id deviceID)
848     {
849         CV_Assert(deviceID);
850         int ndevices = (int)context_.ndevices();
851         CV_Assert(ndevices > 0);
852         bool found = false;
853         for (int i = 0; i < ndevices; i++)
854         {
855             ocl::Device d = context_.device(i);
856             cl_device_id dhandle = (cl_device_id)d.ptr();
857             if (dhandle == deviceID)
858             {
859                 device_ = i;
860                 found = true;
861                 break;
862             }
863         }
864         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
865     }
866
867     void _init_device(const ocl::Device& device)
868     {
869         CV_Assert(device.ptr());
870         int ndevices = (int)context_.ndevices();
871         CV_Assert(ndevices > 0);
872         bool found = false;
873         for (int i = 0; i < ndevices; i++)
874         {
875             ocl::Device d = context_.device(i);
876             if (d.getImpl() == device.getImpl())
877             {
878                 device_ = i;
879                 found = true;
880                 break;
881             }
882         }
883         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
884     }
885
886 public:
887     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888         : device_(0), useOpenCL_(-1)
889     {
890         CV_UNUSED(platformID);
891         CV_Assert(context);
892         CV_Assert(deviceID);
893
894         context_ = Context::fromHandle(context);
895         _init_device(deviceID);
896         queue_ = Queue(context_, context_.device(device_));
897     }
898
899     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900         : device_(0), useOpenCL_(-1)
901     {
902         CV_Assert(context.ptr());
903         CV_Assert(device.ptr());
904
905         context_ = context;
906         _init_device(device);
907         queue_ = queue;
908     }
909
910     Impl(const ocl::Context& context, const ocl::Device& device)
911         : device_(0), useOpenCL_(-1)
912     {
913         CV_Assert(context.ptr());
914         CV_Assert(device.ptr());
915
916         context_ = context;
917         _init_device(device);
918         queue_ = Queue(context_, context_.device(device_));
919     }
920
921     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
922         : context_(context)
923         , device_(device)
924         , queue_(queue)
925         , useOpenCL_(-1)
926     {
927         // nothing
928     }
929     Impl(const Impl& other)
930         : context_(other.context_)
931         , device_(other.device_)
932         , queue_(other.queue_)
933         , useOpenCL_(-1)
934     {
935         // nothing
936     }
937
938     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
939     bool useOpenCL()
940     {
941         if (useOpenCL_ < 0)
942         {
943             try
944             {
945                 useOpenCL_ = 0;
946                 if (!context_.empty() && context_.ndevices() > 0)
947                 {
948                     const Device& d = context_.device(device_);
949                     useOpenCL_ = d.available();
950                 }
951             }
952             catch (const cv::Exception&)
953             {
954                 // nothing
955             }
956             if (!useOpenCL_)
957                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
958         }
959         return useOpenCL_ > 0;
960     }
961
962     void setUseOpenCL(bool flag)
963     {
964         if (!flag)
965             useOpenCL_ = 0;
966         else
967             useOpenCL_ = -1;
968     }
969
970     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
971     {
972         CV_TRACE_FUNCTION();
973
974         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
975
976         static bool initialized = false;
977         static std::shared_ptr<Impl> g_primaryExecutionContext;
978
979         if (!initialized)
980         {
981             cv::AutoLock lock(getInitializationMutex());
982             if (!initialized)
983             {
984                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
985                 try
986                 {
987                     Context c = ocl::Context::create(std::string());
988                     if (c.ndevices())
989                     {
990                         int deviceId = 0;
991                         auto& d = c.device(deviceId);
992                         if (d.available())
993                         {
994                             auto q = ocl::Queue(c, d);
995                             if (!q.ptr())
996                             {
997                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
998                             }
999                             else
1000                             {
1001                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1003                             }
1004                         }
1005                         else
1006                         {
1007                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1008                         }
1009                     }
1010                     else
1011                     {
1012                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1013                     }
1014                 }
1015                 catch (const std::exception& e)
1016                 {
1017                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1018                 }
1019                 catch (...)
1020                 {
1021                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1022                 }
1023                 initialized = true;
1024             }
1025         }
1026         return g_primaryExecutionContext;
1027     }
1028 };
1029
1030 Context& OpenCLExecutionContext::getContext() const
1031 {
1032     CV_Assert(p);
1033     return p->context_;
1034 }
1035 Device& OpenCLExecutionContext::getDevice() const
1036 {
1037     CV_Assert(p);
1038     return p->context_.device(p->device_);
1039 }
1040 Queue& OpenCLExecutionContext::getQueue() const
1041 {
1042     CV_Assert(p);
1043     return p->queue_;
1044 }
1045
1046 bool OpenCLExecutionContext::useOpenCL() const
1047 {
1048     if (p)
1049         return p->useOpenCL();
1050     return false;
1051 }
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1053 {
1054     CV_Assert(p);
1055     p->setUseOpenCL(flag);
1056 }
1057
1058 /* static */
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1060 {
1061     CV_TRACE_FUNCTION();
1062     CoreTLSData& data = getCoreTlsData();
1063     OpenCLExecutionContext& c = data.oclExecutionContext;
1064     if (!data.oclExecutionContextInitialized)
1065     {
1066         data.oclExecutionContextInitialized = true;
1067         if (c.empty() && haveOpenCL())
1068             c.p = Impl::getInitializedExecutionContext();
1069     }
1070     return c;
1071 }
1072
1073 /* static */
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1075 {
1076     CV_TRACE_FUNCTION();
1077     CoreTLSData& data = getCoreTlsData();
1078     OpenCLExecutionContext& c = data.oclExecutionContext;
1079     return c;
1080 }
1081
1082 void OpenCLExecutionContext::bind() const
1083 {
1084     CV_TRACE_FUNCTION();
1085     CV_Assert(p);
1086     CoreTLSData& data = getCoreTlsData();
1087     data.oclExecutionContext = *this;
1088     data.oclExecutionContextInitialized = true;
1089     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1090 }
1091
1092
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1094 {
1095     CV_TRACE_FUNCTION();
1096     CV_Assert(p);
1097     const Queue q(getContext(), getDevice());
1098     return cloneWithNewQueue(q);
1099 }
1100
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1102 {
1103     CV_TRACE_FUNCTION();
1104     CV_Assert(p);
1105     CV_Assert(q.ptr() != NULL);
1106     OpenCLExecutionContext c;
1107     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1108     return c;
1109 }
1110
1111 /* static */
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1113 {
1114     CV_TRACE_FUNCTION();
1115     if (!haveOpenCL())
1116         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1117
1118     CV_Assert(!context.empty());
1119     CV_Assert(context.ptr());
1120     CV_Assert(!device.empty());
1121     CV_Assert(device.ptr());
1122     OpenCLExecutionContext ctx;
1123     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1124     return ctx;
1125
1126 }
1127
1128 /* static */
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1130 {
1131     CV_TRACE_FUNCTION();
1132     if (!haveOpenCL())
1133         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1134
1135     CV_Assert(!context.empty());
1136     CV_Assert(context.ptr());
1137     CV_Assert(!device.empty());
1138     CV_Assert(device.ptr());
1139     OpenCLExecutionContext ctx;
1140     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1141     return ctx;
1142
1143 }
1144
1145 void OpenCLExecutionContext::release()
1146 {
1147     CV_TRACE_FUNCTION();
1148     p.reset();
1149 }
1150
1151
1152 // true if we have initialized OpenCL subsystem with available platforms
1153 static bool g_isOpenCLActivated = false;
1154
1155 bool haveOpenCL()
1156 {
1157     CV_TRACE_FUNCTION();
1158     static bool g_isOpenCLInitialized = false;
1159     static bool g_isOpenCLAvailable = false;
1160
1161     if (!g_isOpenCLInitialized)
1162     {
1163         CV_TRACE_REGION("Init_OpenCL_Runtime");
1164         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1165         if (envPath)
1166         {
1167             if (cv::String(envPath) == "disabled")
1168             {
1169                 g_isOpenCLAvailable = false;
1170                 g_isOpenCLInitialized = true;
1171                 return false;
1172             }
1173         }
1174
1175         cv::AutoLock lock(getInitializationMutex());
1176         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1177         try
1178         {
1179             cl_uint n = 0;
1180             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181             g_isOpenCLActivated = n > 0;
1182             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1183         }
1184         catch (...)
1185         {
1186             g_isOpenCLAvailable = false;
1187         }
1188         g_isOpenCLInitialized = true;
1189     }
1190     return g_isOpenCLAvailable;
1191 }
1192
1193 bool useOpenCL()
1194 {
1195     CoreTLSData& data = getCoreTlsData();
1196     if (data.useOpenCL < 0)
1197     {
1198         try
1199         {
1200             data.useOpenCL = 0;
1201             if (haveOpenCL())
1202             {
1203                 auto c = OpenCLExecutionContext::getCurrent();
1204                 data.useOpenCL = c.useOpenCL();
1205             }
1206         }
1207         catch (...)
1208         {
1209             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1210         }
1211     }
1212     return data.useOpenCL > 0;
1213 }
1214
1215 bool isOpenCLActivated()
1216 {
1217     if (!g_isOpenCLActivated)
1218         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1219     return useOpenCL();
1220 }
1221
1222 void setUseOpenCL(bool flag)
1223 {
1224     CV_TRACE_FUNCTION();
1225
1226     CoreTLSData& data = getCoreTlsData();
1227     auto& c = OpenCLExecutionContext::getCurrentRef();
1228     if (!c.empty())
1229     {
1230         c.setUseOpenCL(flag);
1231         data.useOpenCL = c.useOpenCL();
1232     }
1233     else
1234     {
1235         if (!flag)
1236             data.useOpenCL = 0;
1237         else
1238             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1239     }
1240 }
1241
1242
1243
1244 #ifdef HAVE_CLAMDBLAS
1245
1246 class AmdBlasHelper
1247 {
1248 public:
1249     static AmdBlasHelper & getInstance()
1250     {
1251         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1252     }
1253
1254     bool isAvailable() const
1255     {
1256         return g_isAmdBlasAvailable;
1257     }
1258
1259     ~AmdBlasHelper()
1260     {
1261         try
1262         {
1263             clAmdBlasTeardown();
1264         }
1265         catch (...) { }
1266     }
1267
1268 protected:
1269     AmdBlasHelper()
1270     {
1271         if (!g_isAmdBlasInitialized)
1272         {
1273             AutoLock lock(getInitializationMutex());
1274
1275             if (!g_isAmdBlasInitialized)
1276             {
1277                 if (haveOpenCL())
1278                 {
1279                     try
1280                     {
1281                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1282                     }
1283                     catch (...)
1284                     {
1285                         g_isAmdBlasAvailable = false;
1286                     }
1287                 }
1288                 else
1289                     g_isAmdBlasAvailable = false;
1290
1291                 g_isAmdBlasInitialized = true;
1292             }
1293         }
1294     }
1295
1296 private:
1297     static bool g_isAmdBlasInitialized;
1298     static bool g_isAmdBlasAvailable;
1299 };
1300
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1303
1304 bool haveAmdBlas()
1305 {
1306     return AmdBlasHelper::getInstance().isAvailable();
1307 }
1308
1309 #else
1310
1311 bool haveAmdBlas()
1312 {
1313     return false;
1314 }
1315
1316 #endif
1317
1318 #ifdef HAVE_CLAMDFFT
1319
1320 class AmdFftHelper
1321 {
1322 public:
1323     static AmdFftHelper & getInstance()
1324     {
1325         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1326     }
1327
1328     bool isAvailable() const
1329     {
1330         return g_isAmdFftAvailable;
1331     }
1332
1333     ~AmdFftHelper()
1334     {
1335         try
1336         {
1337 //            clAmdFftTeardown();
1338         }
1339         catch (...) { }
1340     }
1341
1342 protected:
1343     AmdFftHelper()
1344     {
1345         if (!g_isAmdFftInitialized)
1346         {
1347             AutoLock lock(getInitializationMutex());
1348
1349             if (!g_isAmdFftInitialized)
1350             {
1351                 if (haveOpenCL())
1352                 {
1353                     try
1354                     {
1355                         cl_uint major, minor, patch;
1356                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1357
1358                         // it throws exception in case AmdFft binaries are not found
1359                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360                         g_isAmdFftAvailable = true;
1361                     }
1362                     catch (const Exception &)
1363                     {
1364                         g_isAmdFftAvailable = false;
1365                     }
1366                 }
1367                 else
1368                     g_isAmdFftAvailable = false;
1369
1370                 g_isAmdFftInitialized = true;
1371             }
1372         }
1373     }
1374
1375 private:
1376     static clAmdFftSetupData setupData;
1377     static bool g_isAmdFftInitialized;
1378     static bool g_isAmdFftAvailable;
1379 };
1380
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1384
1385 bool haveAmdFft()
1386 {
1387     return AmdFftHelper::getInstance().isAvailable();
1388 }
1389
1390 #else
1391
1392 bool haveAmdFft()
1393 {
1394     return false;
1395 }
1396
1397 #endif
1398
1399 bool haveSVM()
1400 {
1401 #ifdef HAVE_OPENCL_SVM
1402     return true;
1403 #else
1404     return false;
1405 #endif
1406 }
1407
1408 void finish()
1409 {
1410     Queue::getDefault().finish();
1411 }
1412
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1414
1415 struct Platform::Impl
1416 {
1417     Impl()
1418     {
1419         refcount = 1;
1420         handle = 0;
1421         initialized = false;
1422     }
1423
1424     ~Impl() {}
1425
1426     void init()
1427     {
1428         if( !initialized )
1429         {
1430             //cl_uint num_entries
1431             cl_uint n = 0;
1432             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1433                 handle = 0;
1434             if( handle != 0 )
1435             {
1436                 char buf[1000];
1437                 size_t len = 0;
1438                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1439                 buf[len] = '\0';
1440                 vendor = String(buf);
1441             }
1442
1443             initialized = true;
1444         }
1445     }
1446
1447     IMPLEMENT_REFCOUNTABLE();
1448
1449     cl_platform_id handle;
1450     String vendor;
1451     bool initialized;
1452 };
1453
1454 Platform::Platform()
1455 {
1456     p = 0;
1457 }
1458
1459 Platform::~Platform()
1460 {
1461     if(p)
1462         p->release();
1463 }
1464
1465 Platform::Platform(const Platform& pl)
1466 {
1467     p = (Impl*)pl.p;
1468     if(p)
1469         p->addref();
1470 }
1471
1472 Platform& Platform::operator = (const Platform& pl)
1473 {
1474     Impl* newp = (Impl*)pl.p;
1475     if(newp)
1476         newp->addref();
1477     if(p)
1478         p->release();
1479     p = newp;
1480     return *this;
1481 }
1482
1483 void* Platform::ptr() const
1484 {
1485     return p ? p->handle : 0;
1486 }
1487
1488 Platform& Platform::getDefault()
1489 {
1490     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1491     static Platform p;
1492     if( !p.p )
1493     {
1494         p.p = new Impl;
1495         p.p->init();
1496     }
1497     return p;
1498 }
1499
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1501
1502 // Version has format:
1503 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1504 // by specification
1505 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 //   https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html
1508 //   https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html
1509 static void parseOpenCLVersion(const String &version, int &major, int &minor)
1510 {
1511     major = minor = 0;
1512     if (10 >= version.length())
1513         return;
1514     const char *pstr = version.c_str();
1515     if (0 != strncmp(pstr, "OpenCL ", 7))
1516         return;
1517     size_t ppos = version.find('.', 7);
1518     if (String::npos == ppos)
1519         return;
1520     String temp = version.substr(7, ppos - 7);
1521     major = atoi(temp.c_str());
1522     temp = version.substr(ppos + 1);
1523     minor = atoi(temp.c_str());
1524 }
1525
1526 struct Device::Impl
1527 {
1528     Impl(void* d)
1529         : refcount(1)
1530         , handle(0)
1531     {
1532         try
1533         {
1534             cl_device_id device = (cl_device_id)d;
1535             _init(device);
1536             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1537         }
1538         catch (...)
1539         {
1540             throw;
1541         }
1542     }
1543
1544     void _init(cl_device_id d)
1545     {
1546         handle = (cl_device_id)d;
1547
1548         name_ = getStrProp(CL_DEVICE_NAME);
1549         version_ = getStrProp(CL_DEVICE_VERSION);
1550         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1551         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1552         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1553         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1554         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1555         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1556         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1557         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1558
1559         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1560         parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1561
1562         size_t pos = 0;
1563         while (pos < extensions_.size())
1564         {
1565             size_t pos2 = extensions_.find(' ', pos);
1566             if (pos2 == String::npos)
1567                 pos2 = extensions_.size();
1568             if (pos2 > pos)
1569             {
1570                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1571                 extensions_set_.insert(extensionName);
1572             }
1573             pos = pos2 + 1;
1574         }
1575
1576         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1577
1578         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1579         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1580             vendorName_ == "AMD")
1581             vendorID_ = VENDOR_AMD;
1582         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1583             vendorID_ = VENDOR_INTEL;
1584         else if (vendorName_ == "NVIDIA Corporation")
1585             vendorID_ = VENDOR_NVIDIA;
1586         else
1587             vendorID_ = UNKNOWN_VENDOR;
1588
1589         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1590         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1591         {
1592             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1593             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1594                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1595             maxWorkGroupSize_ = new_maxWorkGroupSize;
1596         }
1597 #if 0
1598         if (isExtensionSupported("cl_khr_spir"))
1599         {
1600 #ifndef CL_DEVICE_SPIR_VERSIONS
1601 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1602 #endif
1603             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1604             std::cout << spir_versions << std::endl;
1605         }
1606 #endif
1607     }
1608
1609     ~Impl()
1610     {
1611 #ifdef _WIN32
1612         if (!cv::__termination)
1613 #endif
1614         {
1615             if (handle)
1616             {
1617                 CV_OCL_CHECK(clReleaseDevice(handle));
1618                 handle = 0;
1619             }
1620         }
1621     }
1622
1623     template<typename _TpCL, typename _TpOut>
1624     _TpOut getProp(cl_device_info prop) const
1625     {
1626         _TpCL temp=_TpCL();
1627         size_t sz = 0;
1628
1629         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1630             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1631     }
1632
1633     bool getBoolProp(cl_device_info prop) const
1634     {
1635         cl_bool temp = CL_FALSE;
1636         size_t sz = 0;
1637
1638         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1639             sz == sizeof(temp) ? temp != 0 : false;
1640     }
1641
1642     String getStrProp(cl_device_info prop) const
1643     {
1644         char buf[4096];
1645         size_t sz=0;
1646         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1647             sz < sizeof(buf) ? String(buf) : String();
1648     }
1649
1650     bool isExtensionSupported(const std::string& extensionName) const
1651     {
1652         return extensions_set_.count(extensionName) > 0;
1653     }
1654
1655
1656     IMPLEMENT_REFCOUNTABLE();
1657
1658     cl_device_id handle;
1659
1660     String name_;
1661     String version_;
1662     std::string extensions_;
1663     int doubleFPConfig_;
1664     bool hostUnifiedMemory_;
1665     int maxComputeUnits_;
1666     size_t maxWorkGroupSize_;
1667     int type_;
1668     int addressBits_;
1669     int deviceVersionMajor_;
1670     int deviceVersionMinor_;
1671     String driverVersion_;
1672     String vendorName_;
1673     int vendorID_;
1674     bool intelSubgroupsSupport_;
1675
1676     std::set<std::string> extensions_set_;
1677 };
1678
1679
1680 Device::Device()
1681 {
1682     p = 0;
1683 }
1684
1685 Device::Device(void* d)
1686 {
1687     p = 0;
1688     set(d);
1689 }
1690
1691 Device::Device(const Device& d)
1692 {
1693     p = d.p;
1694     if(p)
1695         p->addref();
1696 }
1697
1698 Device& Device::operator = (const Device& d)
1699 {
1700     Impl* newp = (Impl*)d.p;
1701     if(newp)
1702         newp->addref();
1703     if(p)
1704         p->release();
1705     p = newp;
1706     return *this;
1707 }
1708
1709 Device::~Device()
1710 {
1711     if(p)
1712         p->release();
1713 }
1714
1715 void Device::set(void* d)
1716 {
1717     if(p)
1718         p->release();
1719     p = new Impl(d);
1720     if (p->handle)
1721     {
1722         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1723     }
1724 }
1725
1726 Device Device::fromHandle(void* d)
1727 {
1728     Device device(d);
1729     return device;
1730 }
1731
1732 void* Device::ptr() const
1733 {
1734     return p ? p->handle : 0;
1735 }
1736
1737 String Device::name() const
1738 { return p ? p->name_ : String(); }
1739
1740 String Device::extensions() const
1741 { return p ? String(p->extensions_) : String(); }
1742
1743 bool Device::isExtensionSupported(const String& extensionName) const
1744 { return p ? p->isExtensionSupported(extensionName) : false; }
1745
1746 String Device::version() const
1747 { return p ? p->version_ : String(); }
1748
1749 String Device::vendorName() const
1750 { return p ? p->vendorName_ : String(); }
1751
1752 int Device::vendorID() const
1753 { return p ? p->vendorID_ : 0; }
1754
1755 String Device::OpenCL_C_Version() const
1756 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1757
1758 String Device::OpenCLVersion() const
1759 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1760
1761 int Device::deviceVersionMajor() const
1762 { return p ? p->deviceVersionMajor_ : 0; }
1763
1764 int Device::deviceVersionMinor() const
1765 { return p ? p->deviceVersionMinor_ : 0; }
1766
1767 String Device::driverVersion() const
1768 { return p ? p->driverVersion_ : String(); }
1769
1770 int Device::type() const
1771 { return p ? p->type_ : 0; }
1772
1773 int Device::addressBits() const
1774 { return p ? p->addressBits_ : 0; }
1775
1776 bool Device::available() const
1777 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1778
1779 bool Device::compilerAvailable() const
1780 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1781
1782 bool Device::linkerAvailable() const
1783 #ifdef CL_VERSION_1_2
1784 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1785 #else
1786 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1787 #endif
1788
1789 int Device::doubleFPConfig() const
1790 { return p ? p->doubleFPConfig_ : 0; }
1791
1792 int Device::singleFPConfig() const
1793 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1794
1795 int Device::halfFPConfig() const
1796 #ifdef CL_VERSION_1_2
1797 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1798 #else
1799 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1800 #endif
1801
1802 bool Device::endianLittle() const
1803 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1804
1805 bool Device::errorCorrectionSupport() const
1806 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1807
1808 int Device::executionCapabilities() const
1809 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1810
1811 size_t Device::globalMemCacheSize() const
1812 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1813
1814 int Device::globalMemCacheType() const
1815 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1816
1817 int Device::globalMemCacheLineSize() const
1818 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1819
1820 size_t Device::globalMemSize() const
1821 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1822
1823 size_t Device::localMemSize() const
1824 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1825
1826 int Device::localMemType() const
1827 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1828
1829 bool Device::hostUnifiedMemory() const
1830 { return p ? p->hostUnifiedMemory_ : false; }
1831
1832 bool Device::imageSupport() const
1833 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1834
1835 bool Device::imageFromBufferSupport() const
1836 {
1837     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1838 }
1839
1840 uint Device::imagePitchAlignment() const
1841 {
1842 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1843     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1844 #else
1845     return 0;
1846 #endif
1847 }
1848
1849 uint Device::imageBaseAddressAlignment() const
1850 {
1851 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1852     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1853 #else
1854     return 0;
1855 #endif
1856 }
1857
1858 size_t Device::image2DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1860
1861 size_t Device::image2DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1863
1864 size_t Device::image3DMaxWidth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1866
1867 size_t Device::image3DMaxHeight() const
1868 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1869
1870 size_t Device::image3DMaxDepth() const
1871 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1872
1873 size_t Device::imageMaxBufferSize() const
1874 #ifdef CL_VERSION_1_2
1875 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1876 #else
1877 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1878 #endif
1879
1880 size_t Device::imageMaxArraySize() const
1881 #ifdef CL_VERSION_1_2
1882 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1883 #else
1884 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1885 #endif
1886
1887 bool Device::intelSubgroupsSupport() const
1888 { return p ? p->intelSubgroupsSupport_ : false; }
1889
1890 int Device::maxClockFrequency() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1892
1893 int Device::maxComputeUnits() const
1894 { return p ? p->maxComputeUnits_ : 0; }
1895
1896 int Device::maxConstantArgs() const
1897 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1898
1899 size_t Device::maxConstantBufferSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1901
1902 size_t Device::maxMemAllocSize() const
1903 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1904
1905 size_t Device::maxParameterSize() const
1906 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1907
1908 int Device::maxReadImageArgs() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1910
1911 int Device::maxWriteImageArgs() const
1912 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1913
1914 int Device::maxSamplers() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1916
1917 size_t Device::maxWorkGroupSize() const
1918 { return p ? p->maxWorkGroupSize_ : 0; }
1919
1920 int Device::maxWorkItemDims() const
1921 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1922
1923 void Device::maxWorkItemSizes(size_t* sizes) const
1924 {
1925     if(p)
1926     {
1927         const int MAX_DIMS = 32;
1928         size_t retsz = 0;
1929         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1930                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1931     }
1932 }
1933
1934 int Device::memBaseAddrAlign() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1936
1937 int Device::nativeVectorWidthChar() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1939
1940 int Device::nativeVectorWidthShort() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1942
1943 int Device::nativeVectorWidthInt() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1945
1946 int Device::nativeVectorWidthLong() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1948
1949 int Device::nativeVectorWidthFloat() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1951
1952 int Device::nativeVectorWidthDouble() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1954
1955 int Device::nativeVectorWidthHalf() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1957
1958 int Device::preferredVectorWidthChar() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1960
1961 int Device::preferredVectorWidthShort() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1963
1964 int Device::preferredVectorWidthInt() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1966
1967 int Device::preferredVectorWidthLong() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1969
1970 int Device::preferredVectorWidthFloat() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1972
1973 int Device::preferredVectorWidthDouble() const
1974 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1975
1976 int Device::preferredVectorWidthHalf() const
1977 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1978
1979 size_t Device::printfBufferSize() const
1980 #ifdef CL_VERSION_1_2
1981 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1982 #else
1983 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1984 #endif
1985
1986
1987 size_t Device::profilingTimerResolution() const
1988 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1989
1990 const Device& Device::getDefault()
1991 {
1992     auto& c = OpenCLExecutionContext::getCurrent();
1993     if (!c.empty())
1994     {
1995         return c.getDevice();
1996     }
1997
1998     static Device dummy;
1999     return dummy;
2000 }
2001
2002 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2003
2004 template <typename Functor, typename ObjectType>
2005 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2006 {
2007     ::size_t required;
2008     cl_int err = f(obj, name, 0, NULL, &required);
2009     if (err != CL_SUCCESS)
2010         return err;
2011
2012     param.clear();
2013     if (required > 0)
2014     {
2015         AutoBuffer<char> buf(required + 1);
2016         char* ptr = buf.data(); // cleanup is not needed
2017         err = f(obj, name, required, ptr, NULL);
2018         if (err != CL_SUCCESS)
2019             return err;
2020         param = ptr;
2021     }
2022
2023     return CL_SUCCESS;
2024 }
2025
2026 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2027 {
2028     elems.clear();
2029     if (s.size() == 0)
2030         return;
2031     std::istringstream ss(s);
2032     std::string item;
2033     while (!ss.eof())
2034     {
2035         std::getline(ss, item, delim);
2036         elems.push_back(item);
2037     }
2038 }
2039
2040 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2041 // Sample: AMD:GPU:
2042 // Sample: AMD:GPU:Tahiti
2043 // Sample: :GPU|CPU: = '' = ':' = '::'
2044 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2045         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2046 {
2047     std::vector<std::string> parts;
2048     split(configurationStr, ':', parts);
2049     if (parts.size() > 3)
2050     {
2051         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2052         return false;
2053     }
2054     if (parts.size() > 2)
2055         deviceNameOrID = parts[2];
2056     if (parts.size() > 1)
2057     {
2058         split(parts[1], '|', deviceTypes);
2059     }
2060     if (parts.size() > 0)
2061     {
2062         platform = parts[0];
2063     }
2064     return true;
2065 }
2066
2067 #if defined WINRT || defined _WIN32_WCE
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2069 {
2070     CV_UNUSED(configuration)
2071     return NULL;
2072 }
2073 #else
2074 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2075 {
2076     std::string platform, deviceName;
2077     std::vector<std::string> deviceTypes;
2078
2079     if (!configuration)
2080         configuration = getenv("OPENCV_OPENCL_DEVICE");
2081
2082     if (configuration &&
2083             (strcmp(configuration, "disabled") == 0 ||
2084              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2085             ))
2086         return NULL;
2087
2088     bool isID = false;
2089     int deviceID = -1;
2090     if (deviceName.length() == 1)
2091     // We limit ID range to 0..9, because we want to write:
2092     // - '2500' to mean i5-2500
2093     // - '8350' to mean AMD FX-8350
2094     // - '650' to mean GeForce 650
2095     // To extend ID range change condition to '> 0'
2096     {
2097         isID = true;
2098         for (size_t i = 0; i < deviceName.length(); i++)
2099         {
2100             if (!isdigit(deviceName[i]))
2101             {
2102                 isID = false;
2103                 break;
2104             }
2105         }
2106         if (isID)
2107         {
2108             deviceID = atoi(deviceName.c_str());
2109             if (deviceID < 0)
2110                 return NULL;
2111         }
2112     }
2113
2114     std::vector<cl_platform_id> platforms;
2115     {
2116         cl_uint numPlatforms = 0;
2117         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2118
2119         if (numPlatforms == 0)
2120             return NULL;
2121         platforms.resize((size_t)numPlatforms);
2122         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2123         platforms.resize(numPlatforms);
2124     }
2125
2126     int selectedPlatform = -1;
2127     if (platform.length() > 0)
2128     {
2129         for (size_t i = 0; i < platforms.size(); i++)
2130         {
2131             std::string name;
2132             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2133             if (name.find(platform) != std::string::npos)
2134             {
2135                 selectedPlatform = (int)i;
2136                 break;
2137             }
2138         }
2139         if (selectedPlatform == -1)
2140         {
2141             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2142             goto not_found;
2143         }
2144     }
2145     if (deviceTypes.size() == 0)
2146     {
2147         if (!isID)
2148         {
2149             deviceTypes.push_back("GPU");
2150             if (configuration)
2151                 deviceTypes.push_back("CPU");
2152         }
2153         else
2154             deviceTypes.push_back("ALL");
2155     }
2156     for (size_t t = 0; t < deviceTypes.size(); t++)
2157     {
2158         int deviceType = 0;
2159         std::string tempStrDeviceType = deviceTypes[t];
2160         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2161
2162         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2163             deviceType = Device::TYPE_GPU;
2164         else if (tempStrDeviceType == "cpu")
2165             deviceType = Device::TYPE_CPU;
2166         else if (tempStrDeviceType == "accelerator")
2167             deviceType = Device::TYPE_ACCELERATOR;
2168         else if (tempStrDeviceType == "all")
2169             deviceType = Device::TYPE_ALL;
2170         else
2171         {
2172             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2173             goto not_found;
2174         }
2175
2176         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2177         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2178                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2179                 i++)
2180         {
2181             cl_uint count = 0;
2182             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2183             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2184             {
2185                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2186             }
2187             if (count == 0)
2188                 continue;
2189             size_t base = devices.size();
2190             devices.resize(base + count);
2191             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2192             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2193             {
2194                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2195             }
2196         }
2197
2198         for (size_t i = (isID ? deviceID : 0);
2199              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2200              i++)
2201         {
2202             std::string name;
2203             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2204             cl_bool useGPU = true;
2205             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2206             {
2207                 cl_bool isIGPU = CL_FALSE;
2208                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2209                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2210             }
2211             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2212             {
2213                 // TODO check for OpenCL 1.1
2214                 return devices[i];
2215             }
2216         }
2217     }
2218
2219 not_found:
2220     if (!configuration)
2221         return NULL; // suppress messages on stderr
2222
2223     std::ostringstream msg;
2224     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2225         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2226         << "    Device types:";
2227     for (size_t t = 0; t < deviceTypes.size(); t++)
2228         msg << ' ' << deviceTypes[t];
2229
2230     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2231
2232     CV_LOG_ERROR(NULL, msg.str());
2233     return NULL;
2234 }
2235 #endif
2236
2237 #ifdef HAVE_OPENCL_SVM
2238 namespace svm {
2239
2240 enum AllocatorFlags { // don't use first 16 bits
2241         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2242         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2243         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2244         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2245         OPENCL_SVM_BUFFER_MAP = 4 << 16
2246 };
2247
2248 static bool checkForceSVMUmatUsage()
2249 {
2250     static bool initialized = false;
2251     static bool force = false;
2252     if (!initialized)
2253     {
2254         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2255         initialized = true;
2256     }
2257     return force;
2258 }
2259 static bool checkDisableSVMUMatUsage()
2260 {
2261     static bool initialized = false;
2262     static bool force = false;
2263     if (!initialized)
2264     {
2265         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2266         initialized = true;
2267     }
2268     return force;
2269 }
2270 static bool checkDisableSVM()
2271 {
2272     static bool initialized = false;
2273     static bool force = false;
2274     if (!initialized)
2275     {
2276         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2277         initialized = true;
2278     }
2279     return force;
2280 }
2281 // see SVMCapabilities
2282 static unsigned int getSVMCapabilitiesMask()
2283 {
2284     static bool initialized = false;
2285     static unsigned int mask = 0;
2286     if (!initialized)
2287     {
2288         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2289         if (envValue == NULL)
2290         {
2291             return ~0U; // all bits 1
2292         }
2293         mask = atoi(envValue);
2294         initialized = true;
2295     }
2296     return mask;
2297 }
2298 } // namespace
2299 #endif
2300
2301 static size_t getProgramCountLimit()
2302 {
2303     static bool initialized = false;
2304     static size_t count = 0;
2305     if (!initialized)
2306     {
2307         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2308         initialized = true;
2309     }
2310     return count;
2311 }
2312
2313 static int g_contextId = 0;
2314
2315 class OpenCLBufferPoolImpl;
2316 class OpenCLSVMBufferPoolImpl;
2317
2318 struct Context::Impl
2319 {
2320     static Context::Impl* get(Context& context) { return context.p; }
2321
2322     typedef std::deque<Context::Impl*> container_t;
2323     static container_t& getGlobalContainer()
2324     {
2325         // never delete this container (Impl lifetime is greater due to TLS storage)
2326         static container_t* g_contexts = new container_t();
2327         return *g_contexts;
2328     }
2329
2330 protected:
2331     Impl(const std::string& configuration_)
2332         : refcount(1)
2333         , contextId(CV_XADD(&g_contextId, 1))
2334         , configuration(configuration_)
2335         , handle(0)
2336 #ifdef HAVE_DIRECTX
2337         , p_directx_impl(0)
2338 #endif
2339 #ifdef HAVE_OPENCL_SVM
2340         , svmInitialized(false)
2341 #endif
2342     {
2343         if (!haveOpenCL())
2344             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2345
2346         cv::AutoLock lock(cv::getInitializationMutex());
2347         auto& container = getGlobalContainer();
2348         container.resize(std::max(container.size(), (size_t)contextId + 1));
2349         container[contextId] = this;
2350     }
2351
2352     ~Impl()
2353     {
2354 #ifdef _WIN32
2355         if (!cv::__termination)
2356 #endif
2357         {
2358             if (handle)
2359             {
2360                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2361                 handle = NULL;
2362             }
2363             devices.clear();
2364 #ifdef HAVE_DIRECTX
2365             directx::internal::deleteDirectXImpl(&p_directx_impl);
2366 #endif
2367         }
2368
2369         {
2370             cv::AutoLock lock(cv::getInitializationMutex());
2371             auto& container = getGlobalContainer();
2372             CV_CheckLT((size_t)contextId, container.size(), "");
2373             container[contextId] = NULL;
2374         }
2375     }
2376
2377     void init_device_list()
2378     {
2379         CV_Assert(handle);
2380
2381         cl_uint ndevices = 0;
2382         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2383         CV_Assert(ndevices > 0);
2384
2385         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2386         size_t devices_ret_size = 0;
2387         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2388         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2389
2390         devices.clear();
2391         for (unsigned i = 0; i < ndevices; i++)
2392         {
2393             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2394         }
2395     }
2396
2397     void __init_buffer_pools();  // w/o synchronization
2398     void _init_buffer_pools() const
2399     {
2400         if (!bufferPool_)
2401         {
2402             cv::AutoLock lock(cv::getInitializationMutex());
2403             if (!bufferPool_)
2404             {
2405                 const_cast<Impl*>(this)->__init_buffer_pools();
2406             }
2407         }
2408     }
2409 public:
2410     static Impl* findContext(const std::string& configuration)
2411     {
2412         CV_TRACE_FUNCTION();
2413         cv::AutoLock lock(cv::getInitializationMutex());
2414         auto& container = getGlobalContainer();
2415         if (configuration.empty() && !container.empty())
2416             return container[0];
2417         for (auto it = container.begin(); it != container.end(); ++it)
2418         {
2419             Impl* i = *it;
2420             if (i && i->configuration == configuration)
2421             {
2422                 return i;
2423             }
2424         }
2425         return NULL;
2426     }
2427
2428     static Impl* findOrCreateContext(const std::string& configuration_)
2429     {
2430         CV_TRACE_FUNCTION();
2431         std::string configuration = configuration_;
2432         if (configuration_.empty())
2433         {
2434             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2435             if (c)
2436                 configuration = c;
2437         }
2438         Impl* impl = findContext(configuration);
2439         if (impl)
2440         {
2441             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2442             impl->addref();
2443             return impl;
2444         }
2445
2446         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2447         if (d == NULL)
2448             return NULL;
2449
2450         impl = new Impl(configuration);
2451         try
2452         {
2453             impl->createFromDevice(d);
2454             if (impl->handle)
2455                 return impl;
2456             delete impl;
2457             return NULL;
2458         }
2459         catch (...)
2460         {
2461             delete impl;
2462             throw;
2463         }
2464     }
2465
2466     static Impl* findOrCreateContext(cl_context h)
2467     {
2468         CV_TRACE_FUNCTION();
2469
2470         CV_Assert(h);
2471
2472         std::string configuration = cv::format("@ctx-%p", (void*)h);
2473         Impl* impl = findContext(configuration);
2474         if (impl)
2475         {
2476             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2477             impl->addref();
2478             return impl;
2479         }
2480
2481         impl = new Impl(configuration);
2482         try
2483         {
2484             CV_OCL_CHECK(clRetainContext(h));
2485             impl->handle = h;
2486             impl->init_device_list();
2487             return impl;
2488         }
2489         catch (...)
2490         {
2491             delete impl;
2492             throw;
2493         }
2494     }
2495
2496     static Impl* findOrCreateContext(const ocl::Device& device)
2497     {
2498         CV_TRACE_FUNCTION();
2499
2500         CV_Assert(!device.empty());
2501         cl_device_id d = (cl_device_id)device.ptr();
2502         CV_Assert(d);
2503
2504         std::string configuration = cv::format("@dev-%p", (void*)d);
2505         Impl* impl = findContext(configuration);
2506         if (impl)
2507         {
2508             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2509             impl->addref();
2510             return impl;
2511         }
2512
2513         impl = new Impl(configuration);
2514         try
2515         {
2516             impl->createFromDevice(d);
2517             CV_Assert(impl->handle);
2518             return impl;
2519         }
2520         catch (...)
2521         {
2522             delete impl;
2523             throw;
2524         }
2525     }
2526
2527     void setDefault()
2528     {
2529         CV_TRACE_FUNCTION();
2530         cl_device_id d = selectOpenCLDevice();
2531
2532         if (d == NULL)
2533             return;
2534
2535         createFromDevice(d);
2536     }
2537
2538     void createFromDevice(cl_device_id d)
2539     {
2540         CV_TRACE_FUNCTION();
2541         CV_Assert(handle == NULL);
2542
2543         cl_platform_id pl = NULL;
2544         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2545
2546         cl_context_properties prop[] =
2547         {
2548             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2549             0
2550         };
2551
2552         // !!! in the current implementation force the number of devices to 1 !!!
2553         cl_uint nd = 1;
2554         cl_int status;
2555
2556         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2557         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2558
2559         bool ok = handle != 0 && status == CL_SUCCESS;
2560         if( ok )
2561         {
2562             devices.resize(nd);
2563             devices[0].set(d);
2564         }
2565         else
2566             handle = NULL;
2567     }
2568
2569     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2570
2571     void unloadProg(Program& prog)
2572     {
2573         cv::AutoLock lock(program_cache_mutex);
2574         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2575         {
2576               phash_t::iterator it = phash.find(*i);
2577               if (it != phash.end())
2578               {
2579                   if (it->second.ptr() == prog.ptr())
2580                   {
2581                       phash.erase(*i);
2582                       cacheList.erase(i);
2583                       return;
2584                   }
2585               }
2586         }
2587     }
2588
2589     std::string& getPrefixString()
2590     {
2591         if (prefix.empty())
2592         {
2593             cv::AutoLock lock(program_cache_mutex);
2594             if (prefix.empty())
2595             {
2596                 CV_Assert(!devices.empty());
2597                 const Device& d = devices[0];
2598                 int bits = d.addressBits();
2599                 if (bits > 0 && bits != 64)
2600                     prefix = cv::format("%d-bit--", bits);
2601                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2602                 // sanitize chars
2603                 for (size_t i = 0; i < prefix.size(); i++)
2604                 {
2605                     char c = prefix[i];
2606                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2607                     {
2608                         prefix[i] = '_';
2609                     }
2610                 }
2611             }
2612         }
2613         return prefix;
2614     }
2615
2616     std::string& getPrefixBase()
2617     {
2618         if (prefix_base.empty())
2619         {
2620             cv::AutoLock lock(program_cache_mutex);
2621             if (prefix_base.empty())
2622             {
2623                 const Device& d = devices[0];
2624                 int bits = d.addressBits();
2625                 if (bits > 0 && bits != 64)
2626                     prefix_base = cv::format("%d-bit--", bits);
2627                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2628                 // sanitize chars
2629                 for (size_t i = 0; i < prefix_base.size(); i++)
2630                 {
2631                     char c = prefix_base[i];
2632                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2633                     {
2634                         prefix_base[i] = '_';
2635                     }
2636                 }
2637             }
2638         }
2639         return prefix_base;
2640     }
2641
2642     IMPLEMENT_REFCOUNTABLE();
2643
2644     const int contextId;  // global unique ID
2645     const std::string configuration;
2646
2647     cl_context handle;
2648     std::vector<Device> devices;
2649
2650     std::string prefix;
2651     std::string prefix_base;
2652
2653     cv::Mutex program_cache_mutex;
2654     typedef std::map<std::string, Program> phash_t;
2655     phash_t phash;
2656     typedef std::list<cv::String> CacheList;
2657     CacheList cacheList;
2658
2659     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2660     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2661     OpenCLBufferPoolImpl& getBufferPool() const
2662     {
2663         _init_buffer_pools();
2664         CV_DbgAssert(bufferPool_);
2665         return *bufferPool_.get();
2666     }
2667     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2668     {
2669         _init_buffer_pools();
2670         CV_DbgAssert(bufferPoolHostPtr_);
2671         return *bufferPoolHostPtr_.get();
2672     }
2673
2674 #ifdef HAVE_DIRECTX
2675     directx::internal::OpenCLDirectXImpl* p_directx_impl;
2676
2677     directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2678     {
2679         if (!p_directx_impl)
2680         {
2681             p_directx_impl = directx::internal::createDirectXImpl();
2682         }
2683         return p_directx_impl;
2684     }
2685 #endif
2686
2687 #ifdef HAVE_OPENCL_SVM
2688     bool svmInitialized;
2689     bool svmAvailable;
2690     bool svmEnabled;
2691     svm::SVMCapabilities svmCapabilities;
2692     svm::SVMFunctions svmFunctions;
2693
2694     void svmInit()
2695     {
2696         CV_Assert(handle != NULL);
2697         const Device& device = devices[0];
2698         cl_device_svm_capabilities deviceCaps = 0;
2699         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2700         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2701         if (status != CL_SUCCESS)
2702         {
2703             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2704             goto noSVM;
2705         }
2706         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2707         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2708         svmCapabilities.value_ =
2709                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2710                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2711                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2712                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2713         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2714         if (svmCapabilities.value_ == 0)
2715         {
2716             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2717             goto noSVM;
2718         }
2719         try
2720         {
2721             // Try OpenCL 2.0
2722             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2723             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2724             if (!ptr)
2725             {
2726                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2727                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2728             }
2729             try
2730             {
2731                 bool error = false;
2732                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2733                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2734                 {
2735                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2736                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2737                 }
2738                 clFinish(q);
2739                 try
2740                 {
2741                     ((int*)ptr)[0] = 100;
2742                 }
2743                 catch (...)
2744                 {
2745                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2746                     error = true;
2747                 }
2748                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2749                 {
2750                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2751                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2752                 }
2753                 clFinish(q);
2754                 if (error)
2755                 {
2756                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2757                 }
2758             }
2759             catch (...)
2760             {
2761                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2762                 clSVMFree(handle, ptr);
2763                 throw;
2764             }
2765             clSVMFree(handle, ptr);
2766             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2767             svmFunctions.fn_clSVMFree = clSVMFree;
2768             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2769             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2770             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2771             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2772             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2773             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2774             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2775         }
2776         catch (...)
2777         {
2778             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2779             try
2780             {
2781                 // Try HSA extension
2782                 String extensions = device.extensions();
2783                 if (extensions.find("cl_amd_svm") == String::npos)
2784                 {
2785                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2786                     goto noSVM;
2787                 }
2788                 cl_platform_id p = NULL;
2789                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2790                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2791                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2792                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2793                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2794                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2795                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2796                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2797                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2798                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2799                 CV_Assert(svmFunctions.isValid());
2800             }
2801             catch (...)
2802             {
2803                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2804                 goto noSVM;
2805             }
2806         }
2807
2808         svmAvailable = true;
2809         svmEnabled = !svm::checkDisableSVM();
2810         svmInitialized = true;
2811         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2812         return;
2813     noSVM:
2814         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2815         svmAvailable = false;
2816         svmEnabled = false;
2817         svmCapabilities.value_ = 0;
2818         svmInitialized = true;
2819         svmFunctions.fn_clSVMAlloc = NULL;
2820         return;
2821     }
2822
2823     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2824
2825     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2826     {
2827         _init_buffer_pools();
2828         CV_DbgAssert(bufferPoolSVM_);
2829         return *bufferPoolSVM_.get();
2830     }
2831 #endif
2832
2833     friend class Program;
2834 };
2835
2836
2837 Context::Context()
2838 {
2839     p = 0;
2840 }
2841
2842 Context::~Context()
2843 {
2844     release();
2845 }
2846
2847 // deprecated
2848 Context::Context(int dtype)
2849 {
2850     p = 0;
2851     create(dtype);
2852 }
2853
2854 void Context::release()
2855 {
2856     if (p)
2857     {
2858         p->release();
2859         p = NULL;
2860     }
2861 }
2862
2863 bool Context::create()
2864 {
2865     release();
2866     if (!haveOpenCL())
2867         return false;
2868     p = Impl::findOrCreateContext(std::string());
2869     if (p && p->handle)
2870         return true;
2871     release();
2872     return false;
2873 }
2874
2875 // deprecated
2876 bool Context::create(int dtype)
2877 {
2878     if( !haveOpenCL() )
2879         return false;
2880     release();
2881     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2882     {
2883         p = Impl::findOrCreateContext("");
2884     }
2885     else if (dtype == CL_DEVICE_TYPE_GPU)
2886     {
2887         p = Impl::findOrCreateContext(":GPU:");
2888     }
2889     else if (dtype == CL_DEVICE_TYPE_CPU)
2890     {
2891         p = Impl::findOrCreateContext(":CPU:");
2892     }
2893     else
2894     {
2895         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2896     }
2897     if (p && !p->handle)
2898     {
2899         release();
2900     }
2901     return p != 0;
2902 }
2903
2904 Context::Context(const Context& c)
2905 {
2906     p = (Impl*)c.p;
2907     if(p)
2908         p->addref();
2909 }
2910
2911 Context& Context::operator = (const Context& c)
2912 {
2913     Impl* newp = (Impl*)c.p;
2914     if(newp)
2915         newp->addref();
2916     if(p)
2917         p->release();
2918     p = newp;
2919     return *this;
2920 }
2921
2922 void* Context::ptr() const
2923 {
2924     return p == NULL ? NULL : p->handle;
2925 }
2926
2927 size_t Context::ndevices() const
2928 {
2929     return p ? p->devices.size() : 0;
2930 }
2931
2932 Device& Context::device(size_t idx) const
2933 {
2934     static Device dummy;
2935     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2936 }
2937
2938 Context& Context::getDefault(bool initialize)
2939 {
2940     auto& c = OpenCLExecutionContext::getCurrent();
2941     if (!c.empty())
2942     {
2943         auto& ctx = c.getContext();
2944         return ctx;
2945     }
2946
2947     CV_UNUSED(initialize);
2948     static Context dummy;
2949     return dummy;
2950 }
2951
2952 Program Context::getProg(const ProgramSource& prog,
2953                          const String& buildopts, String& errmsg)
2954 {
2955     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2956 }
2957
2958 void Context::unloadProg(Program& prog)
2959 {
2960     if (p)
2961         p->unloadProg(prog);
2962 }
2963
2964 /* static */
2965 Context Context::fromHandle(void* context)
2966 {
2967     Context ctx;
2968     ctx.p = Impl::findOrCreateContext((cl_context)context);
2969     return ctx;
2970 }
2971
2972 /* static */
2973 Context Context::fromDevice(const ocl::Device& device)
2974 {
2975     Context ctx;
2976     ctx.p = Impl::findOrCreateContext(device);
2977     return ctx;
2978 }
2979
2980 /* static */
2981 Context Context::create(const std::string& configuration)
2982 {
2983     Context ctx;
2984     ctx.p = Impl::findOrCreateContext(configuration);
2985     return ctx;
2986 }
2987
2988 #ifdef HAVE_OPENCL_SVM
2989 bool Context::useSVM() const
2990 {
2991     Context::Impl* i = p;
2992     CV_Assert(i);
2993     if (!i->svmInitialized)
2994         i->svmInit();
2995     return i->svmEnabled;
2996 }
2997 void Context::setUseSVM(bool enabled)
2998 {
2999     Context::Impl* i = p;
3000     CV_Assert(i);
3001     if (!i->svmInitialized)
3002         i->svmInit();
3003     if (enabled && !i->svmAvailable)
3004     {
3005         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3006     }
3007     i->svmEnabled = enabled;
3008 }
3009 #else
3010 bool Context::useSVM() const { return false; }
3011 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3012 #endif
3013
3014 #ifdef HAVE_OPENCL_SVM
3015 namespace svm {
3016
3017 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3018 {
3019     Context::Impl* i = context.p;
3020     CV_Assert(i);
3021     if (!i->svmInitialized)
3022         i->svmInit();
3023     return i->svmCapabilities;
3024 }
3025
3026 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3027 {
3028     Context::Impl* i = context.p;
3029     CV_Assert(i);
3030     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3031     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3032     return &i->svmFunctions;
3033 }
3034
3035 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3036 {
3037     if (checkForceSVMUmatUsage())
3038         return true;
3039     if (checkDisableSVMUMatUsage())
3040         return false;
3041     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3042         return true;
3043     return false; // don't use SVM by default
3044 }
3045
3046 } // namespace cv::ocl::svm
3047 #endif // HAVE_OPENCL_SVM
3048
3049
3050 static void get_platform_name(cl_platform_id id, String& name)
3051 {
3052     // get platform name string length
3053     size_t sz = 0;
3054     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3055
3056     // get platform name string
3057     AutoBuffer<char> buf(sz + 1);
3058     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3059
3060     // just in case, ensure trailing zero for ASCIIZ string
3061     buf[sz] = 0;
3062
3063     name = buf.data();
3064 }
3065
3066 /*
3067 // Attaches OpenCL context to OpenCV
3068 */
3069 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3070 {
3071     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3072     ctx.bind();
3073 }
3074
3075 /* static */
3076 OpenCLExecutionContext OpenCLExecutionContext::create(
3077         const std::string& platformName, void* platformID, void* context, void* deviceID
3078 )
3079 {
3080     if (!haveOpenCL())
3081         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3082
3083     cl_uint cnt = 0;
3084     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3085
3086     if (cnt == 0)
3087         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3088
3089     std::vector<cl_platform_id> platforms(cnt);
3090
3091     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3092
3093     bool platformAvailable = false;
3094
3095     // check if external platformName contained in list of available platforms in OpenCV
3096     for (unsigned int i = 0; i < cnt; i++)
3097     {
3098         String availablePlatformName;
3099         get_platform_name(platforms[i], availablePlatformName);
3100         // external platform is found in the list of available platforms
3101         if (platformName == availablePlatformName)
3102         {
3103             platformAvailable = true;
3104             break;
3105         }
3106     }
3107
3108     if (!platformAvailable)
3109         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3110
3111     // check if platformID corresponds to platformName
3112     String actualPlatformName;
3113     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3114     if (platformName != actualPlatformName)
3115         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3116
3117     OpenCLExecutionContext ctx;
3118     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3119     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3120     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3121     return ctx;
3122 }
3123
3124 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3125 {
3126     // internal call, less checks
3127     cl_platform_id platformID = (cl_platform_id)_platform;
3128     cl_context context = (cl_context)_context;
3129     cl_device_id deviceID = (cl_device_id)_device;
3130
3131     std::string platformName = PlatformInfo(&platformID).name();
3132
3133     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3134     CV_Assert(!clExecCtx.empty());
3135     ctx = clExecCtx.getContext();
3136 }
3137
3138 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3139
3140 struct Queue::Impl
3141 {
3142     inline void __init()
3143     {
3144         refcount = 1;
3145         handle = 0;
3146         isProfilingQueue_ = false;
3147     }
3148
3149     Impl(cl_command_queue q)
3150     {
3151         __init();
3152         handle = q;
3153
3154         cl_command_queue_properties props = 0;
3155         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3156         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3157     }
3158
3159     Impl(cl_command_queue q, bool isProfilingQueue)
3160     {
3161         __init();
3162         handle = q;
3163         isProfilingQueue_ = isProfilingQueue;
3164     }
3165
3166     Impl(const Context& c, const Device& d, bool withProfiling = false)
3167     {
3168         __init();
3169
3170         const Context* pc = &c;
3171         cl_context ch = (cl_context)pc->ptr();
3172         if( !ch )
3173         {
3174             pc = &Context::getDefault();
3175             ch = (cl_context)pc->ptr();
3176         }
3177         cl_device_id dh = (cl_device_id)d.ptr();
3178         if( !dh )
3179             dh = (cl_device_id)pc->device(0).ptr();
3180         cl_int retval = 0;
3181         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3182         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3183         isProfilingQueue_ = withProfiling;
3184     }
3185
3186     ~Impl()
3187     {
3188 #ifdef _WIN32
3189         if (!cv::__termination)
3190 #endif
3191         {
3192             if(handle)
3193             {
3194                 CV_OCL_DBG_CHECK(clFinish(handle));
3195                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3196                 handle = NULL;
3197             }
3198         }
3199     }
3200
3201     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3202     {
3203         if (isProfilingQueue_)
3204             return self;
3205
3206         if (profiling_queue_.ptr())
3207             return profiling_queue_;
3208
3209         cl_context ctx = 0;
3210         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3211
3212         cl_device_id device = 0;
3213         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3214
3215         cl_int result = CL_SUCCESS;
3216         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3217         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3218         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3219
3220         Queue queue;
3221         queue.p = new Impl(q, true);
3222         profiling_queue_ = queue;
3223
3224         return profiling_queue_;
3225     }
3226
3227     IMPLEMENT_REFCOUNTABLE();
3228
3229     cl_command_queue handle;
3230     bool isProfilingQueue_;
3231     cv::ocl::Queue profiling_queue_;
3232 };
3233
3234 Queue::Queue()
3235 {
3236     p = 0;
3237 }
3238
3239 Queue::Queue(const Context& c, const Device& d)
3240 {
3241     p = 0;
3242     create(c, d);
3243 }
3244
3245 Queue::Queue(const Queue& q)
3246 {
3247     p = q.p;
3248     if(p)
3249         p->addref();
3250 }
3251
3252 Queue& Queue::operator = (const Queue& q)
3253 {
3254     Impl* newp = (Impl*)q.p;
3255     if(newp)
3256         newp->addref();
3257     if(p)
3258         p->release();
3259     p = newp;
3260     return *this;
3261 }
3262
3263 Queue::~Queue()
3264 {
3265     if(p)
3266         p->release();
3267 }
3268
3269 bool Queue::create(const Context& c, const Device& d)
3270 {
3271     if(p)
3272         p->release();
3273     p = new Impl(c, d);
3274     return p->handle != 0;
3275 }
3276
3277 void Queue::finish()
3278 {
3279     if(p && p->handle)
3280     {
3281         CV_OCL_DBG_CHECK(clFinish(p->handle));
3282     }
3283 }
3284
3285 const Queue& Queue::getProfilingQueue() const
3286 {
3287     CV_Assert(p);
3288     return p->getProfilingQueue(*this);
3289 }
3290
3291 void* Queue::ptr() const
3292 {
3293     return p ? p->handle : 0;
3294 }
3295
3296 Queue& Queue::getDefault()
3297 {
3298     auto& c = OpenCLExecutionContext::getCurrent();
3299     if (!c.empty())
3300     {
3301         auto& q = c.getQueue();
3302         return q;
3303     }
3304     static Queue dummy;
3305     return dummy;
3306 }
3307
3308 static cl_command_queue getQueue(const Queue& q)
3309 {
3310     cl_command_queue qq = (cl_command_queue)q.ptr();
3311     if(!qq)
3312         qq = (cl_command_queue)Queue::getDefault().ptr();
3313     return qq;
3314 }
3315
3316 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3317
3318 KernelArg::KernelArg()
3319     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3320 {
3321 }
3322
3323 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3324     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3325 {
3326     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3327 }
3328
3329 KernelArg KernelArg::Constant(const Mat& m)
3330 {
3331     CV_Assert(m.isContinuous());
3332     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3333 }
3334
3335 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3336
3337 struct Kernel::Impl
3338 {
3339     Impl(const char* kname, const Program& prog) :
3340         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3341     {
3342         cl_program ph = (cl_program)prog.ptr();
3343         cl_int retval = 0;
3344         name = kname;
3345         if (ph)
3346         {
3347             handle = clCreateKernel(ph, kname, &retval);
3348             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3349         }
3350         for( int i = 0; i < MAX_ARRS; i++ )
3351             u[i] = 0;
3352         haveTempDstUMats = false;
3353         haveTempSrcUMats = false;
3354     }
3355
3356     void cleanupUMats()
3357     {
3358         for( int i = 0; i < MAX_ARRS; i++ )
3359             if( u[i] )
3360             {
3361                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3362                 {
3363                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3364                     u[i]->currAllocator->deallocate(u[i]);
3365                 }
3366                 u[i] = 0;
3367             }
3368         nu = 0;
3369         haveTempDstUMats = false;
3370         haveTempSrcUMats = false;
3371     }
3372
3373     void addUMat(const UMat& m, bool dst)
3374     {
3375         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3376         u[nu] = m.u;
3377         CV_XADD(&m.u->urefcount, 1);
3378         nu++;
3379         if(dst && m.u->tempUMat())
3380             haveTempDstUMats = true;
3381         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3382             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3383     }
3384
3385     void addImage(const Image2D& image)
3386     {
3387         images.push_back(image);
3388     }
3389
3390     void finit(cl_event e)
3391     {
3392         CV_UNUSED(e);
3393         cleanupUMats();
3394         images.clear();
3395         isInProgress = false;
3396         release();
3397     }
3398
3399     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3400             bool sync, int64* timeNS, const Queue& q);
3401
3402     ~Impl()
3403     {
3404         if(handle)
3405         {
3406             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3407         }
3408     }
3409
3410     IMPLEMENT_REFCOUNTABLE();
3411
3412     cv::String name;
3413     cl_kernel handle;
3414     enum { MAX_ARRS = 16 };
3415     UMatData* u[MAX_ARRS];
3416     bool isInProgress;
3417     bool isAsyncRun;  // true if kernel was scheduled in async mode
3418     int nu;
3419     std::list<Image2D> images;
3420     bool haveTempDstUMats;
3421     bool haveTempSrcUMats;
3422 };
3423
3424 }} // namespace cv::ocl
3425
3426 extern "C" {
3427
3428 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3429 {
3430     try
3431     {
3432         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3433     }
3434     catch (const cv::Exception& exc)
3435     {
3436         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3437     }
3438     catch (const std::exception& exc)
3439     {
3440         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3441     }
3442     catch (...)
3443     {
3444         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3445     }
3446 }
3447
3448 }
3449
3450 namespace cv { namespace ocl {
3451
3452 Kernel::Kernel()
3453 {
3454     p = 0;
3455 }
3456
3457 Kernel::Kernel(const char* kname, const Program& prog)
3458 {
3459     p = 0;
3460     create(kname, prog);
3461 }
3462
3463 Kernel::Kernel(const char* kname, const ProgramSource& src,
3464                const String& buildopts, String* errmsg)
3465 {
3466     p = 0;
3467     create(kname, src, buildopts, errmsg);
3468 }
3469
3470 Kernel::Kernel(const Kernel& k)
3471 {
3472     p = k.p;
3473     if(p)
3474         p->addref();
3475 }
3476
3477 Kernel& Kernel::operator = (const Kernel& k)
3478 {
3479     Impl* newp = (Impl*)k.p;
3480     if(newp)
3481         newp->addref();
3482     if(p)
3483         p->release();
3484     p = newp;
3485     return *this;
3486 }
3487
3488 Kernel::~Kernel()
3489 {
3490     if(p)
3491         p->release();
3492 }
3493
3494 bool Kernel::create(const char* kname, const Program& prog)
3495 {
3496     if(p)
3497         p->release();
3498     p = new Impl(kname, prog);
3499     if(p->handle == 0)
3500     {
3501         p->release();
3502         p = 0;
3503     }
3504 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3505     CV_Assert(p);
3506 #endif
3507     return p != 0;
3508 }
3509
3510 bool Kernel::create(const char* kname, const ProgramSource& src,
3511                     const String& buildopts, String* errmsg)
3512 {
3513     if(p)
3514     {
3515         p->release();
3516         p = 0;
3517     }
3518     String tempmsg;
3519     if( !errmsg ) errmsg = &tempmsg;
3520     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3521     return create(kname, prog);
3522 }
3523
3524 void* Kernel::ptr() const
3525 {
3526     return p ? p->handle : 0;
3527 }
3528
3529 bool Kernel::empty() const
3530 {
3531     return ptr() == 0;
3532 }
3533
3534 static cv::String dumpValue(size_t sz, const void* p)
3535 {
3536     if (sz == 4)
3537         return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
3538     if (sz == 8)
3539         return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p);
3540     return cv::format("%p", p);
3541 }
3542
3543 int Kernel::set(int i, const void* value, size_t sz)
3544 {
3545     if (!p || !p->handle)
3546         return -1;
3547     if (i < 0)
3548         return i;
3549     if( i == 0 )
3550         p->cleanupUMats();
3551
3552     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3553     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str());
3554     if (retval != CL_SUCCESS)
3555         return -1;
3556     return i+1;
3557 }
3558
3559 int Kernel::set(int i, const Image2D& image2D)
3560 {
3561     p->addImage(image2D);
3562     cl_mem h = (cl_mem)image2D.ptr();
3563     return set(i, &h, sizeof(h));
3564 }
3565
3566 int Kernel::set(int i, const UMat& m)
3567 {
3568     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3569 }
3570
3571 int Kernel::set(int i, const KernelArg& arg)
3572 {
3573     if( !p || !p->handle )
3574         return -1;
3575     if (i < 0)
3576     {
3577         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3578                 p->name.c_str(), (int)i));
3579         return i;
3580     }
3581     if( i == 0 )
3582         p->cleanupUMats();
3583     cl_int status = 0;
3584     if( arg.m )
3585     {
3586         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3587                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3588         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3589         if (ptronly && arg.m->empty())
3590         {
3591             cl_mem h_null = (cl_mem)NULL;
3592             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3593             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3594             return i + 1;
3595         }
3596         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3597
3598         if (!h)
3599         {
3600             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3601                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3602             p->release();
3603             p = 0;
3604             return -1;
3605         }
3606
3607 #ifdef HAVE_OPENCL_SVM
3608         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3609         {
3610             const Context& ctx = Context::getDefault();
3611             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3612             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3613             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3614 #if 1 // TODO
3615             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3616 #else
3617             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3618 #endif
3619             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3620         }
3621         else
3622 #endif
3623         {
3624             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3625             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3626         }
3627
3628         if (ptronly)
3629         {
3630             i++;
3631         }
3632         else if( arg.m->dims <= 2 )
3633         {
3634             UMat2D u2d(*arg.m);
3635             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3636             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3637             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3638             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3639             i += 3;
3640
3641             if( !(arg.flags & KernelArg::NO_SIZE) )
3642             {
3643                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3644                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3645                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3646                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3647                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3648                 i += 2;
3649             }
3650         }
3651         else
3652         {
3653             UMat3D u3d(*arg.m);
3654             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3655             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3656             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3657             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3658             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3659             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3660             i += 4;
3661             if( !(arg.flags & KernelArg::NO_SIZE) )
3662             {
3663                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3664                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3665                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3666                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3667                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3668                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3669                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3670                 i += 3;
3671             }
3672         }
3673         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3674         return i;
3675     }
3676     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3677     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3678     return i+1;
3679 }
3680
3681 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3682                  bool sync, const Queue& q)
3683 {
3684     if (!p)
3685         return false;
3686
3687     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3688     size_t total = 1;
3689     CV_Assert(_globalsize != NULL);
3690     for (int i = 0; i < dims; i++)
3691     {
3692         size_t val = _localsize ? _localsize[i] :
3693             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3694         CV_Assert( val > 0 );
3695         total *= _globalsize[i];
3696         if (_globalsize[i] == 1 && !_localsize)
3697             val = 1;
3698         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3699     }
3700     CV_Assert(total > 0);
3701
3702     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3703 }
3704
3705
3706 static bool isRaiseErrorOnReuseAsyncKernel()
3707 {
3708     static bool initialized = false;
3709     static bool value = false;
3710     if (!initialized)
3711     {
3712         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3713         initialized = true;
3714     }
3715     return value;
3716 }
3717
3718 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3719         bool sync, int64* timeNS, const Queue& q)
3720 {
3721     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3722
3723     if (!handle)
3724     {
3725         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3726         return false;
3727     }
3728
3729     if (isAsyncRun)
3730     {
3731         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3732         if (isRaiseErrorOnReuseAsyncKernel())
3733             CV_Assert(0);
3734         return false;  // OpenCV 5.0: raise error
3735     }
3736     isAsyncRun = !sync;
3737
3738     if (isInProgress)
3739     {
3740         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3741         if (isRaiseErrorOnReuseAsyncKernel())
3742             CV_Assert(0);
3743         return false;  // OpenCV 5.0: raise error
3744     }
3745
3746     cl_command_queue qq = getQueue(q);
3747     if (haveTempDstUMats)
3748         sync = true;
3749     if (haveTempSrcUMats)
3750         sync = true;
3751     if (timeNS)
3752         sync = true;
3753     cl_event asyncEvent = 0;
3754     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3755                                            NULL, globalsize, localsize, 0, 0,
3756                                            (sync && !timeNS) ? 0 : &asyncEvent);
3757 #if !CV_OPENCL_SHOW_RUN_KERNELS
3758     if (retval != CL_SUCCESS)
3759 #endif
3760     {
3761         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3762                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3763                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3764                         sync ? "true" : "false"
3765                         );
3766         if (retval != CL_SUCCESS)
3767         {
3768             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3769         }
3770 #if CV_OPENCL_TRACE_CHECK
3771         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3772 #else
3773         printf("%s\n", msg.c_str());
3774         fflush(stdout);
3775 #endif
3776     }
3777     if (sync || retval != CL_SUCCESS)
3778     {
3779         CV_OCL_DBG_CHECK(clFinish(qq));
3780         if (timeNS)
3781         {
3782             if (retval == CL_SUCCESS)
3783             {
3784                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3785                 cl_ulong startTime, stopTime;
3786                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3787                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3788                 *timeNS = (int64)(stopTime - startTime);
3789             }
3790             else
3791             {
3792                 *timeNS = -1;
3793             }
3794         }
3795         cleanupUMats();
3796     }
3797     else
3798     {
3799         addref();
3800         isInProgress = true;
3801         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3802     }
3803     if (asyncEvent)
3804         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3805     return retval == CL_SUCCESS;
3806 }
3807
3808 bool Kernel::runTask(bool sync, const Queue& q)
3809 {
3810     if(!p || !p->handle || p->isInProgress)
3811         return false;
3812
3813     cl_command_queue qq = getQueue(q);
3814     cl_event asyncEvent = 0;
3815     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3816     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3817     if (sync || retval != CL_SUCCESS)
3818     {
3819         CV_OCL_DBG_CHECK(clFinish(qq));
3820         p->cleanupUMats();
3821     }
3822     else
3823     {
3824         p->addref();
3825         p->isInProgress = true;
3826         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3827     }
3828     if (asyncEvent)
3829         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3830     return retval == CL_SUCCESS;
3831 }
3832
3833 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3834 {
3835     CV_Assert(p && p->handle && !p->isInProgress);
3836     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3837     CV_Assert(q.ptr());
3838     q.finish(); // call clFinish() on base queue
3839     Queue profilingQueue = q.getProfilingQueue();
3840     int64 timeNs = -1;
3841     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3842     return res ? timeNs : -1;
3843 }
3844
3845 size_t Kernel::workGroupSize() const
3846 {
3847     if(!p || !p->handle)
3848         return 0;
3849     size_t val = 0, retsz = 0;
3850     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3851     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3852     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3853     return status == CL_SUCCESS ? val : 0;
3854 }
3855
3856 size_t Kernel::preferedWorkGroupSizeMultiple() const
3857 {
3858     if(!p || !p->handle)
3859         return 0;
3860     size_t val = 0, retsz = 0;
3861     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3862     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3863     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3864     return status == CL_SUCCESS ? val : 0;
3865 }
3866
3867 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3868 {
3869     if(!p || !p->handle || !wsz)
3870         return 0;
3871     size_t retsz = 0;
3872     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3873     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3874     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3875     return status == CL_SUCCESS;
3876 }
3877
3878 size_t Kernel::localMemSize() const
3879 {
3880     if(!p || !p->handle)
3881         return 0;
3882     size_t retsz = 0;
3883     cl_ulong val = 0;
3884     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3885     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3886     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3887     return status == CL_SUCCESS ? (size_t)val : 0;
3888 }
3889
3890
3891
3892 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3893
3894 struct ProgramSource::Impl
3895 {
3896     IMPLEMENT_REFCOUNTABLE();
3897
3898     enum KIND {
3899         PROGRAM_SOURCE_CODE = 0,
3900         PROGRAM_BINARIES,
3901         PROGRAM_SPIR,
3902         PROGRAM_SPIRV
3903     } kind_;
3904
3905     Impl(const String& src)
3906     {
3907         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3908         initFromSource(src, cv::String());
3909     }
3910     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3911     {
3912         init(PROGRAM_SOURCE_CODE, module, name);
3913         initFromSource(codeStr, codeHash);
3914     }
3915
3916     /// reset fields
3917     void init(enum KIND kind, const String& module, const String& name)
3918     {
3919         refcount = 1;
3920         kind_ = kind;
3921         module_ = module;
3922         name_ = name;
3923
3924         sourceAddr_ = NULL;
3925         sourceSize_ = 0;
3926         isHashUpdated = false;
3927     }
3928
3929     void initFromSource(const String& codeStr, const String& codeHash)
3930     {
3931         codeStr_ = codeStr;
3932         sourceHash_ = codeHash;
3933         if (sourceHash_.empty())
3934         {
3935             updateHash();
3936         }
3937         else
3938         {
3939             isHashUpdated = true;
3940         }
3941     }
3942
3943     void updateHash(const char* hashStr = NULL)
3944     {
3945         if (hashStr)
3946         {
3947             sourceHash_ = cv::String(hashStr);
3948             isHashUpdated = true;
3949             return;
3950         }
3951         uint64 hash = 0;
3952         switch (kind_)
3953         {
3954         case PROGRAM_SOURCE_CODE:
3955             if (sourceAddr_)
3956             {
3957                 CV_Assert(codeStr_.empty());
3958                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3959             }
3960             else
3961             {
3962                 CV_Assert(!codeStr_.empty());
3963                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3964             }
3965             break;
3966         case PROGRAM_BINARIES:
3967         case PROGRAM_SPIR:
3968         case PROGRAM_SPIRV:
3969             hash = crc64(sourceAddr_, sourceSize_);
3970             break;
3971         default:
3972             CV_Error(Error::StsInternal, "Internal error");
3973         }
3974         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3975         isHashUpdated = true;
3976     }
3977
3978     Impl(enum KIND kind,
3979             const String& module, const String& name,
3980             const unsigned char* binary, const size_t size,
3981             const cv::String& buildOptions = cv::String())
3982     {
3983         init(kind, module, name);
3984
3985         sourceAddr_ = binary;
3986         sourceSize_ = size;
3987
3988         buildOptions_ = buildOptions;
3989     }
3990
3991     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3992             const char* sourceCodeStaticStr, const char* hashStaticStr,
3993             const cv::String& buildOptions)
3994     {
3995         ProgramSource result;
3996         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3997                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3998         result.p->updateHash(hashStaticStr);
3999         return result;
4000     }
4001
4002     static ProgramSource fromBinary(const String& module, const String& name,
4003             const unsigned char* binary, const size_t size,
4004             const cv::String& buildOptions)
4005     {
4006         ProgramSource result;
4007         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
4008         return result;
4009     }
4010
4011     static ProgramSource fromSPIR(const String& module, const String& name,
4012             const unsigned char* binary, const size_t size,
4013             const cv::String& buildOptions)
4014     {
4015         ProgramSource result;
4016         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4017         return result;
4018     }
4019
4020     String module_;
4021     String name_;
4022
4023     // TODO std::vector<ProgramSource> includes_;
4024     String codeStr_; // PROGRAM_SOURCE_CODE only
4025
4026     const unsigned char* sourceAddr_;
4027     size_t sourceSize_;
4028
4029     cv::String buildOptions_;
4030
4031     String sourceHash_;
4032     bool isHashUpdated;
4033
4034     friend struct Program::Impl;
4035     friend struct internal::ProgramEntry;
4036     friend struct Context::Impl;
4037 };
4038
4039
4040 ProgramSource::ProgramSource()
4041 {
4042     p = 0;
4043 }
4044
4045 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4046 {
4047     p = new Impl(module, name, codeStr, codeHash);
4048 }
4049
4050 ProgramSource::ProgramSource(const char* prog)
4051 {
4052     p = new Impl(prog);
4053 }
4054
4055 ProgramSource::ProgramSource(const String& prog)
4056 {
4057     p = new Impl(prog);
4058 }
4059
4060 ProgramSource::~ProgramSource()
4061 {
4062     if(p)
4063         p->release();
4064 }
4065
4066 ProgramSource::ProgramSource(const ProgramSource& prog)
4067 {
4068     p = prog.p;
4069     if(p)
4070         p->addref();
4071 }
4072
4073 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4074 {
4075     Impl* newp = (Impl*)prog.p;
4076     if(newp)
4077         newp->addref();
4078     if(p)
4079         p->release();
4080     p = newp;
4081     return *this;
4082 }
4083
4084 const String& ProgramSource::source() const
4085 {
4086     CV_Assert(p);
4087     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4088     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4089     return p->codeStr_;
4090 }
4091
4092 ProgramSource::hash_t ProgramSource::hash() const
4093 {
4094     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4095 }
4096
4097 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4098         const unsigned char* binary, const size_t size,
4099         const cv::String& buildOptions)
4100 {
4101     CV_Assert(binary);
4102     CV_Assert(size > 0);
4103     return Impl::fromBinary(module, name, binary, size, buildOptions);
4104 }
4105
4106 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4107         const unsigned char* binary, const size_t size,
4108         const cv::String& buildOptions)
4109 {
4110     CV_Assert(binary);
4111     CV_Assert(size > 0);
4112     return Impl::fromBinary(module, name, binary, size, buildOptions);
4113 }
4114
4115
4116 internal::ProgramEntry::operator ProgramSource&() const
4117 {
4118     if (this->pProgramSource == NULL)
4119     {
4120         cv::AutoLock lock(cv::getInitializationMutex());
4121         if (this->pProgramSource == NULL)
4122         {
4123             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4124             ProgramSource* ptr = new ProgramSource(ps);
4125             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4126         }
4127     }
4128     return *this->pProgramSource;
4129 }
4130
4131
4132
4133 /////////////////////////////////////////// Program /////////////////////////////////////////////
4134
4135 static
4136 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4137 {
4138     if (b.empty())
4139         return a;
4140     if (a.empty())
4141         return b;
4142     if (b[0] == ' ')
4143         return a + b;
4144     return a + (cv::String(" ") + b);
4145 }
4146
4147 struct Program::Impl
4148 {
4149     IMPLEMENT_REFCOUNTABLE();
4150
4151     Impl(const ProgramSource& src,
4152          const String& _buildflags, String& errmsg) :
4153          refcount(1),
4154          handle(NULL),
4155          buildflags(_buildflags)
4156     {
4157         const ProgramSource::Impl* src_ = src.getImpl();
4158         CV_Assert(src_);
4159         sourceModule_ = src_->module_;
4160         sourceName_ = src_->name_;
4161         const Context ctx = Context::getDefault();
4162         Device device = ctx.device(0);
4163         if (ctx.ptr() == NULL || device.ptr() == NULL)
4164             return;
4165         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4166         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4167         {
4168             if (device.isAMD())
4169                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4170             else if (device.isIntel())
4171                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4172             const String param_buildExtraOptions = getBuildExtraOptions();
4173             if (!param_buildExtraOptions.empty())
4174                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4175         }
4176         compile(ctx, src_, errmsg);
4177     }
4178
4179     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4180     {
4181         CV_Assert(ctx.getImpl());
4182         CV_Assert(src_);
4183
4184         // We don't cache OpenCL binaries
4185         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4186         {
4187             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4188             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4189             return isLoaded;
4190         }
4191         return compileWithCache(ctx, src_, errmsg);
4192     }
4193
4194     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4195     {
4196         CV_Assert(ctx.getImpl());
4197         CV_Assert(src_);
4198         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4199
4200 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4201         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4202         const std::string base_dir = config.prepareCacheDirectoryForContext(
4203                 ctx.getImpl()->getPrefixString(),
4204                 ctx.getImpl()->getPrefixBase()
4205         );
4206         const String& hash_str = src_->sourceHash_;
4207         cv::String fname;
4208         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4209         {
4210             CV_Assert(!hash_str.empty());
4211             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4212             fname = utils::fs::join(base_dir, fname);
4213         }
4214         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4215         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4216         {
4217             try
4218             {
4219                 std::vector<char> binaryBuf;
4220                 bool res = false;
4221                 {
4222                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4223                     BinaryProgramFile file(fname, hash_str.c_str());
4224                     res = file.read(buildflags, binaryBuf);
4225                 }
4226                 if (res)
4227                 {
4228                     CV_Assert(!binaryBuf.empty());
4229                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4230                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4231                     if (isLoaded)
4232                         return true;
4233                 }
4234             }
4235             catch (const cv::Exception& e)
4236             {
4237                 CV_UNUSED(e);
4238                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4239             }
4240             catch (...)
4241             {
4242                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4243             }
4244         }
4245 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4246         CV_Assert(handle == NULL);
4247         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4248         {
4249             if (!buildFromSources(ctx, src_, errmsg))
4250             {
4251                 return false;
4252             }
4253         }
4254         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4255         {
4256             buildflags = joinBuildOptions(buildflags, " -x spir");
4257             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4258             {
4259                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4260             }
4261             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4262             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4263             if (!isLoaded)
4264                 return false;
4265         }
4266         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4267         {
4268             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4269         }
4270         else
4271         {
4272             CV_Error(Error::StsInternal, "Internal error");
4273         }
4274         CV_Assert(handle != NULL);
4275 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4276         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4277         {
4278             try
4279             {
4280                 std::vector<char> binaryBuf;
4281                 getProgramBinary(binaryBuf);
4282                 {
4283                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4284                     BinaryProgramFile file(fname, hash_str.c_str());
4285                     file.write(buildflags, binaryBuf);
4286                 }
4287             }
4288             catch (const cv::Exception& e)
4289             {
4290                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4291             }
4292             catch (...)
4293             {
4294                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4295             }
4296         }
4297 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4298 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4299         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4300         {
4301             std::vector<char> binaryBuf;
4302             getProgramBinary(binaryBuf);
4303             if (!binaryBuf.empty())
4304             {
4305                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4306                 handle = NULL;
4307                 createFromBinary(ctx, binaryBuf, errmsg);
4308             }
4309         }
4310 #endif
4311         return handle != NULL;
4312     }
4313
4314     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4315     {
4316         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4317
4318         size_t retsz = 0;
4319         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4320                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4321         if (log_retval == CL_SUCCESS && retsz > 1)
4322         {
4323             buffer.resize(retsz + 16);
4324             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4325                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4326             if (log_retval == CL_SUCCESS)
4327             {
4328                 if (retsz < buffer.size())
4329                     buffer[retsz] = 0;
4330                 else
4331                     buffer[buffer.size() - 1] = 0;
4332             }
4333             else
4334             {
4335                 buffer[0] = 0;
4336             }
4337         }
4338
4339         errmsg = String(buffer.data());
4340         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4341                 sourceModule_.c_str(), sourceName_.c_str(),
4342                 result, getOpenCLErrorString(result),
4343                 buildflags.c_str(), errmsg.c_str());
4344         fflush(stdout);
4345     }
4346
4347     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4348     {
4349         CV_Assert(src_);
4350         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4351         CV_Assert(handle == NULL);
4352         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4353                 sourceModule_.c_str(), sourceName_.c_str(),
4354                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4355
4356         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4357
4358         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4359         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4360         CV_Assert(srcptr != NULL);
4361         CV_Assert(srclen > 0);
4362
4363         cl_int retval = 0;
4364
4365         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4366         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4367         CV_Assert(handle || retval != CL_SUCCESS);
4368         if (handle && retval == CL_SUCCESS)
4369         {
4370             size_t n = ctx.ndevices();
4371             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4372             cl_device_id* deviceList = deviceListBuf.data();
4373             for (size_t i = 0; i < n; i++)
4374             {
4375                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4376             }
4377
4378             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4379             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4380 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4381             if (retval != CL_SUCCESS)
4382 #endif
4383             {
4384                 dumpBuildLog_(retval, deviceList, errmsg);
4385
4386                 // don't remove "retval != CL_SUCCESS" condition here:
4387                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4388                 if (retval != CL_SUCCESS && handle)
4389                 {
4390                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4391                     handle = NULL;
4392                 }
4393             }
4394 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4395             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4396             {
4397                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4398                 size_t retsz = 0;
4399                 char kernels_buffer[4096] = {0};
4400                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4401                 if (retsz < sizeof(kernels_buffer))
4402                     kernels_buffer[retsz] = 0;
4403                 else
4404                     kernels_buffer[0] = 0;
4405                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4406             }
4407 #endif
4408
4409         }
4410         return handle != NULL;
4411     }
4412
4413     void getProgramBinary(std::vector<char>& buf)
4414     {
4415         CV_Assert(handle);
4416         size_t sz = 0;
4417         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4418         buf.resize(sz);
4419         uchar* ptr = (uchar*)&buf[0];
4420         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4421     }
4422
4423     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4424     {
4425         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4426     }
4427
4428     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4429     {
4430         CV_Assert(handle == NULL);
4431         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4432         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4433
4434         CV_Assert(binarySize > 0);
4435
4436         size_t ndevices = (int)ctx.ndevices();
4437         AutoBuffer<cl_device_id> devices_(ndevices);
4438         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4439         AutoBuffer<size_t> binarySizes_(ndevices);
4440
4441         cl_device_id* devices = devices_.data();
4442         const uchar** binaryPtrs = binaryPtrs_.data();
4443         size_t* binarySizes = binarySizes_.data();
4444         for (size_t i = 0; i < ndevices; i++)
4445         {
4446             devices[i] = (cl_device_id)ctx.device(i).ptr();
4447             binaryPtrs[i] = binaryAddr;
4448             binarySizes[i] = binarySize;
4449         }
4450
4451         cl_int result = 0;
4452         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4453                                            binarySizes, binaryPtrs, NULL, &result);
4454         if (result != CL_SUCCESS)
4455         {
4456             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4457             if (handle)
4458             {
4459                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4460                 handle = NULL;
4461             }
4462         }
4463         if (!handle)
4464         {
4465             return false;
4466         }
4467         // call clBuildProgram()
4468         {
4469             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4470             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4471             if (result != CL_SUCCESS)
4472             {
4473                 dumpBuildLog_(result, devices, errmsg);
4474                 if (handle)
4475                 {
4476                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4477                     handle = NULL;
4478                 }
4479                 return false;
4480             }
4481         }
4482         // check build status
4483         {
4484             cl_build_status build_status = CL_BUILD_NONE;
4485             size_t retsz = 0;
4486             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4487                     sizeof(build_status), &build_status, &retsz));
4488             if (result == CL_SUCCESS)
4489             {
4490                 if (build_status == CL_BUILD_SUCCESS)
4491                 {
4492                     return true;
4493                 }
4494                 else
4495                 {
4496                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4497                     return false;
4498                 }
4499             }
4500             else
4501             {
4502                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4503                 if (handle)
4504                 {
4505                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4506                     handle = NULL;
4507                 }
4508             }
4509         }
4510 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4511         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4512         {
4513             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4514             size_t retsz = 0;
4515             char kernels_buffer[4096] = {0};
4516             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4517             if (retsz < sizeof(kernels_buffer))
4518                 kernels_buffer[retsz] = 0;
4519             else
4520                 kernels_buffer[0] = 0;
4521             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4522         }
4523 #endif
4524         return handle != NULL;
4525     }
4526
4527     ~Impl()
4528     {
4529         if( handle )
4530         {
4531 #ifdef _WIN32
4532             if (!cv::__termination)
4533 #endif
4534             {
4535                 clReleaseProgram(handle);
4536             }
4537             handle = NULL;
4538         }
4539     }
4540
4541     cl_program handle;
4542
4543     String buildflags;
4544     String sourceModule_;
4545     String sourceName_;
4546 };
4547
4548
4549 Program::Program() { p = 0; }
4550
4551 Program::Program(const ProgramSource& src,
4552         const String& buildflags, String& errmsg)
4553 {
4554     p = 0;
4555     create(src, buildflags, errmsg);
4556 }
4557
4558 Program::Program(const Program& prog)
4559 {
4560     p = prog.p;
4561     if(p)
4562         p->addref();
4563 }
4564
4565 Program& Program::operator = (const Program& prog)
4566 {
4567     Impl* newp = (Impl*)prog.p;
4568     if(newp)
4569         newp->addref();
4570     if(p)
4571         p->release();
4572     p = newp;
4573     return *this;
4574 }
4575
4576 Program::~Program()
4577 {
4578     if(p)
4579         p->release();
4580 }
4581
4582 bool Program::create(const ProgramSource& src,
4583             const String& buildflags, String& errmsg)
4584 {
4585     if(p)
4586     {
4587         p->release();
4588         p = NULL;
4589     }
4590     p = new Impl(src, buildflags, errmsg);
4591     if(!p->handle)
4592     {
4593         p->release();
4594         p = 0;
4595     }
4596     return p != 0;
4597 }
4598
4599 void* Program::ptr() const
4600 {
4601     return p ? p->handle : 0;
4602 }
4603
4604 #ifndef OPENCV_REMOVE_DEPRECATED_API
4605 const ProgramSource& Program::source() const
4606 {
4607     CV_Error(Error::StsNotImplemented, "Removed API");
4608 }
4609
4610 bool Program::read(const String& bin, const String& buildflags)
4611 {
4612     CV_UNUSED(bin); CV_UNUSED(buildflags);
4613     CV_Error(Error::StsNotImplemented, "Removed API");
4614 }
4615
4616 bool Program::write(String& bin) const
4617 {
4618     CV_UNUSED(bin);
4619     CV_Error(Error::StsNotImplemented, "Removed API");
4620 }
4621
4622 String Program::getPrefix() const
4623 {
4624     if(!p)
4625         return String();
4626     Context::Impl* ctx_ = Context::getDefault().getImpl();
4627     CV_Assert(ctx_);
4628     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4629 }
4630
4631 String Program::getPrefix(const String& buildflags)
4632 {
4633         Context::Impl* ctx_ = Context::getDefault().getImpl();
4634         CV_Assert(ctx_);
4635         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4636 }
4637 #endif // OPENCV_REMOVE_DEPRECATED_API
4638
4639 void Program::getBinary(std::vector<char>& binary) const
4640 {
4641     CV_Assert(p && "Empty program");
4642     p->getProgramBinary(binary);
4643 }
4644
4645 Program Context::Impl::getProg(const ProgramSource& src,
4646                                const String& buildflags, String& errmsg)
4647 {
4648     size_t limit = getProgramCountLimit();
4649     const ProgramSource::Impl* src_ = src.getImpl();
4650     CV_Assert(src_);
4651     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4652             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4653             getPrefixString().c_str(),
4654             buildflags.c_str());
4655     {
4656         cv::AutoLock lock(program_cache_mutex);
4657         phash_t::iterator it = phash.find(key);
4658         if (it != phash.end())
4659         {
4660             // TODO LRU cache
4661             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4662             if (i != cacheList.end() && i != cacheList.begin())
4663             {
4664                 cacheList.erase(i);
4665                 cacheList.push_front(key);
4666             }
4667             return it->second;
4668         }
4669         { // cleanup program cache
4670             size_t sz = phash.size();
4671             if (limit > 0 && sz >= limit)
4672             {
4673                 static bool warningFlag = false;
4674                 if (!warningFlag)
4675                 {
4676                     printf("\nWARNING: OpenCV-OpenCL:\n"
4677                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4678                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4679                     warningFlag = true;
4680                 }
4681                 while (!cacheList.empty())
4682                 {
4683                     size_t c = phash.erase(cacheList.back());
4684                     cacheList.pop_back();
4685                     if (c != 0)
4686                         break;
4687                 }
4688             }
4689         }
4690     }
4691     Program prog(src, buildflags, errmsg);
4692     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4693     {
4694         cv::AutoLock lock(program_cache_mutex);
4695         phash.insert(std::pair<std::string, Program>(key, prog));
4696         cacheList.push_front(key);
4697     }
4698     return prog;
4699 }
4700
4701
4702 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4703
4704 template<typename T>
4705 class OpenCLBufferPool
4706 {
4707 protected:
4708     ~OpenCLBufferPool() { }
4709 public:
4710     virtual T allocate(size_t size) = 0;
4711     virtual void release(T buffer) = 0;
4712 };
4713
4714 template <typename Derived, typename BufferEntry, typename T>
4715 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4716 {
4717 private:
4718     inline Derived& derived() { return *static_cast<Derived*>(this); }
4719 protected:
4720     Mutex mutex_;
4721
4722     size_t currentReservedSize;
4723     size_t maxReservedSize;
4724
4725     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4726     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4727
4728     // synchronized
4729     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4730     {
4731         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4732         for (; i != allocatedEntries_.end(); ++i)
4733         {
4734             BufferEntry& e = *i;
4735             if (e.clBuffer_ == buffer)
4736             {
4737                 entry = e;
4738                 allocatedEntries_.erase(i);
4739                 return true;
4740             }
4741         }
4742         return false;
4743     }
4744
4745     // synchronized
4746     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4747     {
4748         if (reservedEntries_.empty())
4749             return false;
4750         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4751         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4752         BufferEntry result;
4753         size_t minDiff = (size_t)(-1);
4754         for (; i != reservedEntries_.end(); ++i)
4755         {
4756             BufferEntry& e = *i;
4757             if (e.capacity_ >= size)
4758             {
4759                 size_t diff = e.capacity_ - size;
4760                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4761                 {
4762                     minDiff = diff;
4763                     result_pos = i;
4764                     result = e;
4765                     if (diff == 0)
4766                         break;
4767                 }
4768             }
4769         }
4770         if (result_pos != reservedEntries_.end())
4771         {
4772             //CV_DbgAssert(result == *result_pos);
4773             reservedEntries_.erase(result_pos);
4774             entry = result;
4775             currentReservedSize -= entry.capacity_;
4776             allocatedEntries_.push_back(entry);
4777             return true;
4778         }
4779         return false;
4780     }
4781
4782     // synchronized
4783     void _checkSizeOfReservedEntries()
4784     {
4785         while (currentReservedSize > maxReservedSize)
4786         {
4787             CV_DbgAssert(!reservedEntries_.empty());
4788             const BufferEntry& entry = reservedEntries_.back();
4789             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4790             currentReservedSize -= entry.capacity_;
4791             derived()._releaseBufferEntry(entry);
4792             reservedEntries_.pop_back();
4793         }
4794     }
4795
4796     inline size_t _allocationGranularity(size_t size)
4797     {
4798         // heuristic values
4799         if (size < 1024*1024)
4800             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4801         else if (size < 16*1024*1024)
4802             return 64*1024;
4803         else
4804             return 1024*1024;
4805     }
4806
4807 public:
4808     OpenCLBufferPoolBaseImpl()
4809         : currentReservedSize(0),
4810           maxReservedSize(0)
4811     {
4812         // nothing
4813     }
4814     virtual ~OpenCLBufferPoolBaseImpl()
4815     {
4816         freeAllReservedBuffers();
4817         CV_Assert(reservedEntries_.empty());
4818     }
4819 public:
4820     virtual T allocate(size_t size) CV_OVERRIDE
4821     {
4822         AutoLock locker(mutex_);
4823         BufferEntry entry;
4824         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4825         {
4826             CV_DbgAssert(size <= entry.capacity_);
4827             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4828         }
4829         else
4830         {
4831             derived()._allocateBufferEntry(entry, size);
4832         }
4833         return entry.clBuffer_;
4834     }
4835     virtual void release(T buffer) CV_OVERRIDE
4836     {
4837         AutoLock locker(mutex_);
4838         BufferEntry entry;
4839         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4840         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4841         {
4842             derived()._releaseBufferEntry(entry);
4843         }
4844         else
4845         {
4846             reservedEntries_.push_front(entry);
4847             currentReservedSize += entry.capacity_;
4848             _checkSizeOfReservedEntries();
4849         }
4850     }
4851
4852     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4853     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4854     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4855     {
4856         AutoLock locker(mutex_);
4857         size_t oldMaxReservedSize = maxReservedSize;
4858         maxReservedSize = size;
4859         if (maxReservedSize < oldMaxReservedSize)
4860         {
4861             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4862             for (; i != reservedEntries_.end();)
4863             {
4864                 const BufferEntry& entry = *i;
4865                 if (entry.capacity_ > maxReservedSize / 8)
4866                 {
4867                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4868                     currentReservedSize -= entry.capacity_;
4869                     derived()._releaseBufferEntry(entry);
4870                     i = reservedEntries_.erase(i);
4871                     continue;
4872                 }
4873                 ++i;
4874             }
4875             _checkSizeOfReservedEntries();
4876         }
4877     }
4878     virtual void freeAllReservedBuffers() CV_OVERRIDE
4879     {
4880         AutoLock locker(mutex_);
4881         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4882         for (; i != reservedEntries_.end(); ++i)
4883         {
4884             const BufferEntry& entry = *i;
4885             derived()._releaseBufferEntry(entry);
4886         }
4887         reservedEntries_.clear();
4888         currentReservedSize = 0;
4889     }
4890 };
4891
4892 struct CLBufferEntry
4893 {
4894     cl_mem clBuffer_;
4895     size_t capacity_;
4896     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4897 };
4898
4899 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4900 {
4901 public:
4902     typedef struct CLBufferEntry BufferEntry;
4903 protected:
4904     int createFlags_;
4905 public:
4906     OpenCLBufferPoolImpl(int createFlags = 0)
4907         : createFlags_(createFlags)
4908     {
4909     }
4910
4911     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4912     {
4913         CV_DbgAssert(entry.clBuffer_ == NULL);
4914         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4915         Context& ctx = Context::getDefault();
4916         cl_int retval = CL_SUCCESS;
4917         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4918         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4919         CV_Assert(entry.clBuffer_ != NULL);
4920         if(retval == CL_SUCCESS)
4921         {
4922             CV_IMPL_ADD(CV_IMPL_OCL);
4923         }
4924         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4925                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4926         allocatedEntries_.push_back(entry);
4927     }
4928
4929     void _releaseBufferEntry(const BufferEntry& entry)
4930     {
4931         CV_Assert(entry.capacity_ != 0);
4932         CV_Assert(entry.clBuffer_ != NULL);
4933         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4934                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4935         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4936     }
4937 };
4938
4939 #ifdef HAVE_OPENCL_SVM
4940 struct CLSVMBufferEntry
4941 {
4942     void* clBuffer_;
4943     size_t capacity_;
4944     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4945 };
4946 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4947 {
4948 public:
4949     typedef struct CLSVMBufferEntry BufferEntry;
4950 public:
4951     OpenCLSVMBufferPoolImpl()
4952     {
4953     }
4954
4955     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4956     {
4957         CV_DbgAssert(entry.clBuffer_ == NULL);
4958         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4959
4960         Context& ctx = Context::getDefault();
4961         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4962         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4963         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4964                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4965
4966         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4967         CV_DbgAssert(svmFns->isValid());
4968
4969         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4970         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4971         CV_Assert(buf);
4972
4973         entry.clBuffer_ = buf;
4974         {
4975             CV_IMPL_ADD(CV_IMPL_OCL);
4976         }
4977         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4978                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4979         allocatedEntries_.push_back(entry);
4980     }
4981
4982     void _releaseBufferEntry(const BufferEntry& entry)
4983     {
4984         CV_Assert(entry.capacity_ != 0);
4985         CV_Assert(entry.clBuffer_ != NULL);
4986         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4987                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4988         Context& ctx = Context::getDefault();
4989         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4990         CV_DbgAssert(svmFns->isValid());
4991         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4992         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4993     }
4994 };
4995 #endif
4996
4997
4998
4999 template <bool readAccess, bool writeAccess>
5000 class AlignedDataPtr
5001 {
5002 protected:
5003     const size_t size_;
5004     uchar* const originPtr_;
5005     const size_t alignment_;
5006     uchar* ptr_;
5007     uchar* allocatedPtr_;
5008
5009 public:
5010     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
5011         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5012     {
5013         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5014         CV_DbgAssert(!readAccess || ptr);
5015         if (((size_t)ptr_ & (alignment - 1)) != 0)
5016         {
5017             allocatedPtr_ = new uchar[size_ + alignment - 1];
5018             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5019             if (readAccess)
5020             {
5021                 memcpy(ptr_, originPtr_, size_);
5022             }
5023         }
5024     }
5025
5026     uchar* getAlignedPtr() const
5027     {
5028         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5029         return ptr_;
5030     }
5031
5032     ~AlignedDataPtr()
5033     {
5034         if (allocatedPtr_)
5035         {
5036             if (writeAccess)
5037             {
5038                 memcpy(originPtr_, ptr_, size_);
5039             }
5040             delete[] allocatedPtr_;
5041             allocatedPtr_ = NULL;
5042         }
5043         ptr_ = NULL;
5044     }
5045 private:
5046     AlignedDataPtr(const AlignedDataPtr&); // disabled
5047     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5048 };
5049
5050 template <bool readAccess, bool writeAccess>
5051 class AlignedDataPtr2D
5052 {
5053 protected:
5054     const size_t size_;
5055     uchar* const originPtr_;
5056     const size_t alignment_;
5057     uchar* ptr_;
5058     uchar* allocatedPtr_;
5059     size_t rows_;
5060     size_t cols_;
5061     size_t step_;
5062
5063 public:
5064     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5065         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5066     {
5067         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5068         CV_DbgAssert(!readAccess || ptr != NULL);
5069         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5070         {
5071             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5072             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5073             if (readAccess)
5074             {
5075                 for (size_t i = 0; i < rows_; i++)
5076                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5077             }
5078         }
5079     }
5080
5081     uchar* getAlignedPtr() const
5082     {
5083         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5084         return ptr_;
5085     }
5086
5087     ~AlignedDataPtr2D()
5088     {
5089         if (allocatedPtr_)
5090         {
5091             if (writeAccess)
5092             {
5093                 for (size_t i = 0; i < rows_; i++)
5094                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5095             }
5096             delete[] allocatedPtr_;
5097             allocatedPtr_ = NULL;
5098         }
5099         ptr_ = NULL;
5100     }
5101 private:
5102     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5103     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5104 };
5105
5106 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5107 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5108 #endif
5109
5110
5111 void Context::Impl::__init_buffer_pools()
5112 {
5113     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5114     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5115     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5116     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5117
5118     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5119     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5120     bufferPool.setMaxReservedSize(poolSize);
5121     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5122     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5123
5124 #ifdef HAVE_OPENCL_SVM
5125     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5126     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5127     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5128     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5129 #endif
5130
5131     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5132 }
5133
5134 class OpenCLAllocator CV_FINAL : public MatAllocator
5135 {
5136 public:
5137     enum AllocatorFlags
5138     {
5139         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5140         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5141 #ifdef HAVE_OPENCL_SVM
5142         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5143 #endif
5144         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5145     };
5146
5147     OpenCLAllocator()
5148     {
5149         matStdAllocator = Mat::getDefaultAllocator();
5150     }
5151     ~OpenCLAllocator()
5152     {
5153         flushCleanupQueue();
5154     }
5155
5156     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5157             AccessFlag flags, UMatUsageFlags usageFlags) const
5158     {
5159         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5160         return u;
5161     }
5162
5163     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5164     {
5165         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5166         return value;
5167     }
5168     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5169     {
5170         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5171         return value;
5172     }
5173
5174     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5175     {
5176         const Device& dev = ctx.device(0);
5177         createFlags = 0;
5178         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5179             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5180
5181         if (!isOpenCLCopyingForced() &&
5182             (isOpenCLMapForced() ||
5183                 (dev.hostUnifiedMemory()
5184 #ifndef __APPLE__
5185                 || dev.isIntel()
5186 #endif
5187                 )
5188             )
5189         )
5190             flags0 = static_cast<UMatData::MemoryFlag>(0);
5191         else
5192             flags0 = UMatData::COPY_ON_MAP;
5193     }
5194
5195     UMatData* allocate(int dims, const int* sizes, int type,
5196                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5197     {
5198         if(!useOpenCL())
5199             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5200
5201         flushCleanupQueue();
5202
5203         CV_Assert(data == 0);
5204         size_t total = CV_ELEM_SIZE(type);
5205         for( int i = dims-1; i >= 0; i-- )
5206         {
5207             if( step )
5208                 step[i] = total;
5209             total *= sizes[i];
5210         }
5211
5212         Context& ctx = Context::getDefault();
5213         if (!ctx.getImpl())
5214             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5215         Context::Impl& ctxImpl = *ctx.getImpl();
5216
5217         int createFlags = 0;
5218         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5219         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5220
5221         void* handle = NULL;
5222         int allocatorFlags = 0;
5223
5224 #ifdef HAVE_OPENCL_SVM
5225         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5226         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5227         {
5228             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5229             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5230
5231             // this property is constant, so single buffer pool can be used here
5232             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5233             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5234         }
5235         else
5236 #endif
5237         if (createFlags == 0)
5238         {
5239             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5240             handle = ctxImpl.getBufferPool().allocate(total);
5241         }
5242         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5243         {
5244             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5245             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5246         }
5247         else
5248         {
5249             CV_Assert(handle != NULL); // Unsupported, throw
5250         }
5251
5252         if (!handle)
5253             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5254
5255         UMatData* u = new UMatData(this);
5256         u->data = 0;
5257         u->size = total;
5258         u->handle = handle;
5259         u->flags = flags0;
5260         u->allocatorFlags_ = allocatorFlags;
5261         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5262         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5263         u->markHostCopyObsolete(true);
5264         opencl_allocator_stats.onAllocate(u->size);
5265         return u;
5266     }
5267
5268     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5269     {
5270         if(!u)
5271             return false;
5272
5273         flushCleanupQueue();
5274
5275         UMatDataAutoLock lock(u);
5276
5277         if(u->handle == 0)
5278         {
5279             CV_Assert(u->origdata != 0);
5280             Context& ctx = Context::getDefault();
5281             int createFlags = 0;
5282             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5283             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5284
5285             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5286
5287             cl_context ctx_handle = (cl_context)ctx.ptr();
5288             int allocatorFlags = 0;
5289             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5290             void* handle = NULL;
5291             cl_int retval = CL_SUCCESS;
5292
5293 #ifdef HAVE_OPENCL_SVM
5294             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5295             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5296             if (useSVM && svmCaps.isSupportFineGrainSystem())
5297             {
5298                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5299                 tempUMatFlags = UMatData::TEMP_UMAT;
5300                 handle = u->origdata;
5301                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5302             }
5303             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5304             {
5305                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5306                 {
5307                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5308
5309                     cl_svm_mem_flags memFlags = createFlags |
5310                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5311
5312                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5313                     CV_DbgAssert(svmFns->isValid());
5314
5315                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5316                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5317                     CV_Assert(handle);
5318
5319                     cl_command_queue q = NULL;
5320                     if (!isFineGrainBuffer)
5321                     {
5322                         q = (cl_command_queue)Queue::getDefault().ptr();
5323                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5324                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5325                                 handle, u->size,
5326                                 0, NULL, NULL);
5327                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5328
5329                     }
5330                     memcpy(handle, u->origdata, u->size);
5331                     if (!isFineGrainBuffer)
5332                     {
5333                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5334                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5335                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5336                     }
5337
5338                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5339                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5340                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5341                 }
5342             }
5343             else
5344 #endif
5345             {
5346                 if( copyOnMap )
5347                     accessFlags &= ~ACCESS_FAST;
5348
5349                 tempUMatFlags = UMatData::TEMP_UMAT;
5350                 if (
5351                 #ifdef __APPLE__
5352                     !copyOnMap &&
5353                 #endif
5354                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5355                     // There are OpenCL runtime issues for less aligned data
5356                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5357                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5358                     // Avoid sharing of host memory between OpenCL buffers
5359                     && !(u->originalUMatData && u->originalUMatData->handle)
5360                 )
5361                 {
5362                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5363                                             u->size, u->origdata, &retval);
5364                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5365                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5366                 }
5367                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5368                 {
5369                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5370                                                u->size, u->origdata, &retval);
5371                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5372                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5373                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5374                 }
5375             }
5376             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5377             if(!handle || retval != CL_SUCCESS)
5378                 return false;
5379             u->handle = handle;
5380             u->prevAllocator = u->currAllocator;
5381             u->currAllocator = this;
5382             u->flags |= tempUMatFlags | flags0;
5383             u->allocatorFlags_ = allocatorFlags;
5384         }
5385         if (!!(accessFlags & ACCESS_WRITE))
5386             u->markHostCopyObsolete(true);
5387         opencl_allocator_stats.onAllocate(u->size);
5388         return true;
5389     }
5390
5391     /*void sync(UMatData* u) const
5392     {
5393         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5394         UMatDataAutoLock lock(u);
5395
5396         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5397         {
5398             if( u->tempCopiedUMat() )
5399             {
5400                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5401                                     u->size, u->origdata, 0, 0, 0);
5402             }
5403             else
5404             {
5405                 cl_int retval = 0;
5406                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5407                                                 (CL_MAP_READ | CL_MAP_WRITE),
5408                                                 0, u->size, 0, 0, 0, &retval);
5409                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5410                 clFinish(q);
5411             }
5412             u->markHostCopyObsolete(false);
5413         }
5414         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5415         {
5416             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5417                                  u->size, u->data, 0, 0, 0);
5418         }
5419     }*/
5420
5421     void deallocate(UMatData* u) const CV_OVERRIDE
5422     {
5423         if(!u)
5424             return;
5425
5426         CV_Assert(u->urefcount == 0);
5427         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5428
5429         CV_Assert(u->handle != 0);
5430         CV_Assert(u->mapcount == 0);
5431
5432         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5433             addToCleanupQueue(u);
5434         else
5435             deallocate_(u);
5436     }
5437
5438     void deallocate_(UMatData* u) const
5439     {
5440         CV_Assert(u);
5441         CV_Assert(u->handle);
5442         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5443         {
5444             opencl_allocator_stats.onFree(u->size);
5445         }
5446
5447 #ifdef _WIN32
5448         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5449             return;             // avoid any OpenCL calls
5450 #endif
5451         if(u->tempUMat())
5452         {
5453             CV_Assert(u->origdata);
5454 //            UMatDataAutoLock lock(u);
5455
5456             if (u->hostCopyObsolete())
5457             {
5458 #ifdef HAVE_OPENCL_SVM
5459                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5460                 {
5461                     Context& ctx = Context::getDefault();
5462                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5463                     CV_DbgAssert(svmFns->isValid());
5464
5465                     if( u->tempCopiedUMat() )
5466                     {
5467                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5468                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5469                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5470                         cl_command_queue q = NULL;
5471                         if (!isFineGrainBuffer)
5472                         {
5473                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5474                             q = (cl_command_queue)Queue::getDefault().ptr();
5475                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5476                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5477                                     u->handle, u->size,
5478                                     0, NULL, NULL);
5479                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5480                         }
5481                         clFinish(q);
5482                         memcpy(u->origdata, u->handle, u->size);
5483                         if (!isFineGrainBuffer)
5484                         {
5485                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5486                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5487                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5488                         }
5489                     }
5490                     else
5491                     {
5492                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5493                         // nothing
5494                     }
5495                 }
5496                 else
5497 #endif
5498                 {
5499                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5500                     if( u->tempCopiedUMat() )
5501                     {
5502                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5503                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5504                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5505                     }
5506                     else
5507                     {
5508                         cl_int retval = 0;
5509                         if (u->tempUMat())
5510                         {
5511                             CV_Assert(u->mapcount == 0);
5512                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5513                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5514                                 (CL_MAP_READ | CL_MAP_WRITE),
5515                                 0, u->size, 0, 0, 0, &retval);
5516                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5517                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5518                             if (u->originalUMatData)
5519                             {
5520                                 CV_Assert(u->originalUMatData->data == data);
5521                             }
5522                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5523                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5524                             CV_OCL_DBG_CHECK(clFinish(q));
5525                         }
5526                     }
5527                 }
5528                 u->markHostCopyObsolete(false);
5529             }
5530             else
5531             {
5532                 // nothing
5533             }
5534 #ifdef HAVE_OPENCL_SVM
5535             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5536             {
5537                 if( u->tempCopiedUMat() )
5538                 {
5539                     Context& ctx = Context::getDefault();
5540                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5541                     CV_DbgAssert(svmFns->isValid());
5542
5543                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5544                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5545                 }
5546             }
5547             else
5548 #endif
5549             {
5550                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5551                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5552             }
5553             u->handle = 0;
5554             u->markDeviceCopyObsolete(true);
5555             u->currAllocator = u->prevAllocator;
5556             u->prevAllocator = NULL;
5557             if(u->data && u->copyOnMap() && u->data != u->origdata)
5558                 fastFree(u->data);
5559             u->data = u->origdata;
5560             u->currAllocator->deallocate(u);
5561             u = NULL;
5562         }
5563         else
5564         {
5565             CV_Assert(u->origdata == NULL);
5566             if(u->data && u->copyOnMap() && u->data != u->origdata)
5567             {
5568                 fastFree(u->data);
5569                 u->data = 0;
5570                 u->markHostCopyObsolete(true);
5571             }
5572             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5573             {
5574                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5575                 CV_Assert(pCtx);
5576                 ocl::Context& ctx = *pCtx.get();
5577                 CV_Assert(ctx.getImpl());
5578                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5579             }
5580             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5581             {
5582                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5583                 CV_Assert(pCtx);
5584                 ocl::Context& ctx = *pCtx.get();
5585                 CV_Assert(ctx.getImpl());
5586                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5587             }
5588 #ifdef HAVE_OPENCL_SVM
5589             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5590             {
5591                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5592                 CV_Assert(pCtx);
5593                 ocl::Context& ctx = *pCtx.get();
5594                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5595                 {
5596                     //nothing
5597                 }
5598                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5599                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5600                 {
5601                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5602                     CV_DbgAssert(svmFns->isValid());
5603                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5604
5605                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5606                     {
5607                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5608                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5609                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5610                     }
5611                 }
5612                 CV_Assert(ctx.getImpl());
5613                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5614             }
5615 #endif
5616             else
5617             {
5618                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5619             }
5620             u->handle = 0;
5621             u->markDeviceCopyObsolete(true);
5622             delete u;
5623             u = NULL;
5624         }
5625         CV_Assert(u == NULL);
5626     }
5627
5628     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5629     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5630     {
5631         CV_Assert(u && u->handle);
5632
5633         if (!!(accessFlags & ACCESS_WRITE))
5634             u->markDeviceCopyObsolete(true);
5635
5636         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5637
5638         {
5639             if( !u->copyOnMap() )
5640             {
5641                 // TODO
5642                 // because there can be other map requests for the same UMat with different access flags,
5643                 // we use the universal (read-write) access mode.
5644 #ifdef HAVE_OPENCL_SVM
5645                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5646                 {
5647                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5648                     {
5649                         Context& ctx = Context::getDefault();
5650                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5651                         CV_DbgAssert(svmFns->isValid());
5652
5653                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5654                         {
5655                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5656                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5657                                     u->handle, u->size,
5658                                     0, NULL, NULL);
5659                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5660                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5661                         }
5662                     }
5663                     clFinish(q);
5664                     u->data = (uchar*)u->handle;
5665                     u->markHostCopyObsolete(false);
5666                     u->markDeviceMemMapped(true);
5667                     return;
5668                 }
5669 #endif
5670
5671                 cl_int retval = CL_SUCCESS;
5672                 if (!u->deviceMemMapped())
5673                 {
5674                     CV_Assert(u->refcount == 1);
5675                     CV_Assert(u->mapcount++ == 0);
5676                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5677                                                          (CL_MAP_READ | CL_MAP_WRITE),
5678                                                          0, u->size, 0, 0, 0, &retval);
5679                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5680                 }
5681                 if (u->data && retval == CL_SUCCESS)
5682                 {
5683                     u->markHostCopyObsolete(false);
5684                     u->markDeviceMemMapped(true);
5685                     return;
5686                 }
5687
5688                 // TODO Is it really a good idea and was it tested well?
5689                 // if map failed, switch to copy-on-map mode for the particular buffer
5690                 u->flags |= UMatData::COPY_ON_MAP;
5691             }
5692
5693             if(!u->data)
5694             {
5695                 u->data = (uchar*)fastMalloc(u->size);
5696                 u->markHostCopyObsolete(true);
5697             }
5698         }
5699
5700         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5701         {
5702             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5703 #ifdef HAVE_OPENCL_SVM
5704             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5705 #endif
5706             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5707                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5708             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5709                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5710             u->markHostCopyObsolete(false);
5711         }
5712     }
5713
5714     void unmap(UMatData* u) const CV_OVERRIDE
5715     {
5716         if(!u)
5717             return;
5718
5719
5720         CV_Assert(u->handle != 0);
5721
5722         UMatDataAutoLock autolock(u);
5723
5724         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5725         cl_int retval = 0;
5726         if( !u->copyOnMap() && u->deviceMemMapped() )
5727         {
5728             CV_Assert(u->data != NULL);
5729 #ifdef HAVE_OPENCL_SVM
5730             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5731             {
5732                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5733                 {
5734                     Context& ctx = Context::getDefault();
5735                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5736                     CV_DbgAssert(svmFns->isValid());
5737
5738                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5739                     {
5740                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5741                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5742                                 0, NULL, NULL);
5743                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5744                         clFinish(q);
5745                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5746                     }
5747                 }
5748                 if (u->refcount == 0)
5749                     u->data = 0;
5750                 u->markDeviceCopyObsolete(false);
5751                 u->markHostCopyObsolete(true);
5752                 return;
5753             }
5754 #endif
5755             if (u->refcount == 0)
5756             {
5757                 CV_Assert(u->mapcount-- == 1);
5758                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5759                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5760                 if (Device::getDefault().isAMD())
5761                 {
5762                     // required for multithreaded applications (see stitching test)
5763                     CV_OCL_DBG_CHECK(clFinish(q));
5764                 }
5765                 u->markDeviceMemMapped(false);
5766                 u->data = 0;
5767                 u->markDeviceCopyObsolete(false);
5768                 u->markHostCopyObsolete(true);
5769             }
5770         }
5771         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5772         {
5773             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5774 #ifdef HAVE_OPENCL_SVM
5775             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5776 #endif
5777             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5778                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5779             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5780                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5781             u->markDeviceCopyObsolete(false);
5782             u->markHostCopyObsolete(true);
5783         }
5784     }
5785
5786     bool checkContinuous(int dims, const size_t sz[],
5787                          const size_t srcofs[], const size_t srcstep[],
5788                          const size_t dstofs[], const size_t dststep[],
5789                          size_t& total, size_t new_sz[],
5790                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5791                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5792     {
5793         bool iscontinuous = true;
5794         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5795         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5796         total = sz[dims-1];
5797         for( int i = dims-2; i >= 0; i-- )
5798         {
5799             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5800                 iscontinuous = false;
5801             total *= sz[i];
5802             if( srcofs )
5803                 srcrawofs += srcofs[i]*srcstep[i];
5804             if( dstofs )
5805                 dstrawofs += dstofs[i]*dststep[i];
5806         }
5807
5808         if( !iscontinuous )
5809         {
5810             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5811             if( dims == 2 )
5812             {
5813                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5814                 // we assume that new_... arrays are initialized by caller
5815                 // with 0's, so there is no else branch
5816                 if( srcofs )
5817                 {
5818                     new_srcofs[0] = srcofs[1];
5819                     new_srcofs[1] = srcofs[0];
5820                     new_srcofs[2] = 0;
5821                 }
5822
5823                 if( dstofs )
5824                 {
5825                     new_dstofs[0] = dstofs[1];
5826                     new_dstofs[1] = dstofs[0];
5827                     new_dstofs[2] = 0;
5828                 }
5829
5830                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5831                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5832             }
5833             else
5834             {
5835                 // we could check for dims == 3 here,
5836                 // but from user perspective this one is more informative
5837                 CV_Assert(dims <= 3);
5838                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5839                 if( srcofs )
5840                 {
5841                     new_srcofs[0] = srcofs[2];
5842                     new_srcofs[1] = srcofs[1];
5843                     new_srcofs[2] = srcofs[0];
5844                 }
5845
5846                 if( dstofs )
5847                 {
5848                     new_dstofs[0] = dstofs[2];
5849                     new_dstofs[1] = dstofs[1];
5850                     new_dstofs[2] = dstofs[0];
5851                 }
5852
5853                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5854                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5855             }
5856         }
5857         return iscontinuous;
5858     }
5859
5860     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5861                   const size_t srcofs[], const size_t srcstep[],
5862                   const size_t dststep[]) const CV_OVERRIDE
5863     {
5864         if(!u)
5865             return;
5866         UMatDataAutoLock autolock(u);
5867
5868         if( u->data && !u->hostCopyObsolete() )
5869         {
5870             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5871             return;
5872         }
5873         CV_Assert( u->handle != 0 );
5874
5875         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5876
5877         size_t total = 0, new_sz[] = {0, 0, 0};
5878         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5879         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5880
5881         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5882                                             total, new_sz,
5883                                             srcrawofs, new_srcofs, new_srcstep,
5884                                             dstrawofs, new_dstofs, new_dststep);
5885
5886 #ifdef HAVE_OPENCL_SVM
5887         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5888         {
5889             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5890             Context& ctx = Context::getDefault();
5891             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5892             CV_DbgAssert(svmFns->isValid());
5893
5894             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5895             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5896             {
5897                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5898                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5899                         u->handle, u->size,
5900                         0, NULL, NULL);
5901                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5902             }
5903             clFinish(q);
5904             if( iscontinuous )
5905             {
5906                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5907             }
5908             else
5909             {
5910                 // This code is from MatAllocator::download()
5911                 int isz[CV_MAX_DIM];
5912                 uchar* srcptr = (uchar*)u->handle;
5913                 for( int i = 0; i < dims; i++ )
5914                 {
5915                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5916                     if( sz[i] == 0 )
5917                     return;
5918                     if( srcofs )
5919                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5920                     isz[i] = (int)sz[i];
5921                 }
5922
5923                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5924                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5925
5926                 const Mat* arrays[] = { &src, &dst };
5927                 uchar* ptrs[2];
5928                 NAryMatIterator it(arrays, ptrs, 2);
5929                 size_t j, planesz = it.size;
5930
5931                 for( j = 0; j < it.nplanes; j++, ++it )
5932                     memcpy(ptrs[1], ptrs[0], planesz);
5933             }
5934             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5935             {
5936                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5937                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5938                         0, NULL, NULL);
5939                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5940                 clFinish(q);
5941             }
5942         }
5943         else
5944 #endif
5945         {
5946             if( iscontinuous )
5947             {
5948                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5949                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5950                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5951             }
5952             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5953             {
5954                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5955                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5956                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5957                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5958                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5959                 uchar* ptr = alignedPtr.getAlignedPtr();
5960
5961                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5962                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5963                 total = std::min(total, u->size - new_srcrawofs);
5964                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5965                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5966                 for( size_t i = 0; i < new_sz[1]; i++ )
5967                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5968             }
5969             else
5970             {
5971                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5972                 uchar* ptr = alignedPtr.getAlignedPtr();
5973
5974                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5975                     new_srcofs, new_dstofs, new_sz,
5976                     new_srcstep[0], 0,
5977                     new_dststep[0], 0,
5978                     ptr, 0, 0, 0));
5979             }
5980         }
5981     }
5982
5983     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5984                 const size_t dstofs[], const size_t dststep[],
5985                 const size_t srcstep[]) const CV_OVERRIDE
5986     {
5987         if(!u)
5988             return;
5989
5990         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5991         CV_Assert(u->refcount == 0 || u->tempUMat());
5992
5993         size_t total = 0, new_sz[] = {0, 0, 0};
5994         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5995         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5996
5997         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5998                                             total, new_sz,
5999                                             srcrawofs, new_srcofs, new_srcstep,
6000                                             dstrawofs, new_dstofs, new_dststep);
6001
6002         UMatDataAutoLock autolock(u);
6003
6004         // if there is cached CPU copy of the GPU matrix,
6005         // we could use it as a destination.
6006         // we can do it in 2 cases:
6007         //    1. we overwrite the whole content
6008         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
6009         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
6010         {
6011             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6012             u->markHostCopyObsolete(false);
6013             u->markDeviceCopyObsolete(true);
6014             return;
6015         }
6016
6017         CV_Assert( u->handle != 0 );
6018         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6019
6020 #ifdef HAVE_OPENCL_SVM
6021         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6022         {
6023             CV_DbgAssert(u->data == NULL || u->data == u->handle);
6024             Context& ctx = Context::getDefault();
6025             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6026             CV_DbgAssert(svmFns->isValid());
6027
6028             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6029             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6030             {
6031                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6032                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6033                         u->handle, u->size,
6034                         0, NULL, NULL);
6035                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6036             }
6037             clFinish(q);
6038             if( iscontinuous )
6039             {
6040                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6041             }
6042             else
6043             {
6044                 // This code is from MatAllocator::upload()
6045                 int isz[CV_MAX_DIM];
6046                 uchar* dstptr = (uchar*)u->handle;
6047                 for( int i = 0; i < dims; i++ )
6048                 {
6049                     CV_Assert( sz[i] <= (size_t)INT_MAX );
6050                     if( sz[i] == 0 )
6051                     return;
6052                     if( dstofs )
6053                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6054                     isz[i] = (int)sz[i];
6055                 }
6056
6057                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6058                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6059
6060                 const Mat* arrays[] = { &src, &dst };
6061                 uchar* ptrs[2];
6062                 NAryMatIterator it(arrays, ptrs, 2);
6063                 size_t j, planesz = it.size;
6064
6065                 for( j = 0; j < it.nplanes; j++, ++it )
6066                     memcpy(ptrs[1], ptrs[0], planesz);
6067             }
6068             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6069             {
6070                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6071                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6072                         0, NULL, NULL);
6073                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6074                 clFinish(q);
6075             }
6076         }
6077         else
6078 #endif
6079         {
6080             if( iscontinuous )
6081             {
6082                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6083                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6084                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6085                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6086                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6087             }
6088             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6089             {
6090                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6091                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6092                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6093                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6094                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6095                 uchar* ptr = alignedPtr.getAlignedPtr();
6096
6097                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6098                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6099                 total = std::min(total, u->size - new_dstrawofs);
6100                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6101                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6102                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6103                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6104                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6105                 for( size_t i = 0; i < new_sz[1]; i++ )
6106                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6107                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6108                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6109             }
6110             else
6111             {
6112                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6113                 uchar* ptr = alignedPtr.getAlignedPtr();
6114
6115                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6116                     new_dstofs, new_srcofs, new_sz,
6117                     new_dststep[0], 0,
6118                     new_srcstep[0], 0,
6119                     ptr, 0, 0, 0));
6120             }
6121         }
6122         u->markHostCopyObsolete(true);
6123 #ifdef HAVE_OPENCL_SVM
6124         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6125                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6126         {
6127             // nothing
6128         }
6129         else
6130 #endif
6131         {
6132             u->markHostCopyObsolete(true);
6133         }
6134         u->markDeviceCopyObsolete(false);
6135     }
6136
6137     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6138               const size_t srcofs[], const size_t srcstep[],
6139               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6140     {
6141         if(!src || !dst)
6142             return;
6143
6144         size_t total = 0, new_sz[] = {0, 0, 0};
6145         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6146         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6147
6148         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6149                                             total, new_sz,
6150                                             srcrawofs, new_srcofs, new_srcstep,
6151                                             dstrawofs, new_dstofs, new_dststep);
6152
6153         UMatDataAutoLock src_autolock(src, dst);
6154
6155         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6156         {
6157             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6158             return;
6159         }
6160         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6161         {
6162             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6163             dst->markHostCopyObsolete(false);
6164 #ifdef HAVE_OPENCL_SVM
6165             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6166                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6167             {
6168                 // nothing
6169             }
6170             else
6171 #endif
6172             {
6173                 dst->markDeviceCopyObsolete(true);
6174             }
6175             return;
6176         }
6177
6178         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6179         CV_Assert(dst->refcount == 0);
6180         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6181
6182         cl_int retval = CL_SUCCESS;
6183 #ifdef HAVE_OPENCL_SVM
6184         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6185                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6186         {
6187             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6188                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6189             {
6190                 Context& ctx = Context::getDefault();
6191                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6192                 CV_DbgAssert(svmFns->isValid());
6193
6194                 if( iscontinuous )
6195                 {
6196                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6197                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6198                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6199                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6200                             total, 0, NULL, NULL);
6201                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6202                 }
6203                 else
6204                 {
6205                     clFinish(q);
6206                     // This code is from MatAllocator::download()/upload()
6207                     int isz[CV_MAX_DIM];
6208                     uchar* srcptr = (uchar*)src->handle;
6209                     for( int i = 0; i < dims; i++ )
6210                     {
6211                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6212                         if( sz[i] == 0 )
6213                         return;
6214                         if( srcofs )
6215                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6216                         isz[i] = (int)sz[i];
6217                     }
6218                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6219
6220                     uchar* dstptr = (uchar*)dst->handle;
6221                     for( int i = 0; i < dims; i++ )
6222                     {
6223                         if( dstofs )
6224                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6225                     }
6226                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6227
6228                     const Mat* arrays[] = { &m_src, &m_dst };
6229                     uchar* ptrs[2];
6230                     NAryMatIterator it(arrays, ptrs, 2);
6231                     size_t j, planesz = it.size;
6232
6233                     for( j = 0; j < it.nplanes; j++, ++it )
6234                         memcpy(ptrs[1], ptrs[0], planesz);
6235                 }
6236             }
6237             else
6238             {
6239                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6240                 {
6241                     map(src, ACCESS_READ);
6242                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6243                     unmap(src);
6244                 }
6245                 else
6246                 {
6247                     map(dst, ACCESS_WRITE);
6248                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6249                     unmap(dst);
6250                 }
6251             }
6252         }
6253         else
6254 #endif
6255         {
6256             if( iscontinuous )
6257             {
6258                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6259                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6260                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6261                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6262             }
6263             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6264             {
6265                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6266                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6267                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6268                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6269                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6270
6271                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6272                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6273                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6274                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6275                 uchar* srcptr = srcBuf.getAlignedPtr();
6276                 uchar* dstptr = dstBuf.getAlignedPtr();
6277
6278                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6279
6280                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6281                 src_total = std::min(src_total, src->size - new_srcrawofs);
6282                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6283                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6284
6285                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6286                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6287                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6288                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6289
6290                 for( size_t i = 0; i < new_sz[1]; i++ )
6291                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6292                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6293                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6294                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6295             }
6296             else
6297             {
6298                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6299                                                    new_srcofs, new_dstofs, new_sz,
6300                                                    new_srcstep[0], 0,
6301                                                    new_dststep[0], 0,
6302                                                    0, 0, 0));
6303             }
6304         }
6305         if (retval == CL_SUCCESS)
6306         {
6307             CV_IMPL_ADD(CV_IMPL_OCL)
6308         }
6309
6310 #ifdef HAVE_OPENCL_SVM
6311         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6312             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6313         {
6314             // nothing
6315         }
6316         else
6317 #endif
6318         {
6319             dst->markHostCopyObsolete(true);
6320         }
6321         dst->markDeviceCopyObsolete(false);
6322
6323         if( _sync )
6324         {
6325             CV_OCL_DBG_CHECK(clFinish(q));
6326         }
6327     }
6328
6329     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6330     {
6331         ocl::Context ctx = Context::getDefault();
6332         if (ctx.empty())
6333             return NULL;
6334 #ifdef HAVE_OPENCL_SVM
6335         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6336         {
6337             return &ctx.getImpl()->getBufferPoolSVM();
6338         }
6339 #endif
6340         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6341         {
6342             return &ctx.getImpl()->getBufferPoolHostPtr();
6343         }
6344         if (id != NULL && strcmp(id, "OCL") != 0)
6345         {
6346             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6347         }
6348         return &ctx.getImpl()->getBufferPool();
6349     }
6350
6351     MatAllocator* matStdAllocator;
6352
6353     mutable cv::Mutex cleanupQueueMutex;
6354     mutable std::deque<UMatData*> cleanupQueue;
6355
6356     void flushCleanupQueue() const
6357     {
6358         if (!cleanupQueue.empty())
6359         {
6360             std::deque<UMatData*> q;
6361             {
6362                 cv::AutoLock lock(cleanupQueueMutex);
6363                 q.swap(cleanupQueue);
6364             }
6365             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6366             {
6367                 deallocate_(*i);
6368             }
6369         }
6370     }
6371     void addToCleanupQueue(UMatData* u) const
6372     {
6373         //TODO: Validation check: CV_Assert(!u->tempUMat());
6374         {
6375             cv::AutoLock lock(cleanupQueueMutex);
6376             cleanupQueue.push_back(u);
6377         }
6378     }
6379 };
6380
6381 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6382 {
6383     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6384     return g_allocator;
6385 }
6386 MatAllocator* getOpenCLAllocator()
6387 {
6388     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6389 }
6390
6391 }} // namespace cv::ocl
6392
6393
6394 namespace cv {
6395
6396 // three funcs below are implemented in umatrix.cpp
6397 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6398               bool autoSteps = false );
6399 void finalizeHdr(UMat& m);
6400
6401 } // namespace cv
6402
6403
6404 namespace cv { namespace ocl {
6405
6406 /*
6407 // Convert OpenCL buffer memory to UMat
6408 */
6409 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6410 {
6411     int d = 2;
6412     int sizes[] = { rows, cols };
6413
6414     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6415
6416     dst.release();
6417
6418     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6419     dst.usageFlags = USAGE_DEFAULT;
6420
6421     setSize(dst, d, sizes, 0, true);
6422     dst.offset = 0;
6423
6424     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6425     cl_mem_object_type mem_type = 0;
6426
6427     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6428
6429     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6430
6431     size_t total = 0;
6432     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6433
6434     CV_OCL_CHECK(clRetainMemObject(memobj));
6435
6436     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6437     CV_Assert(total >= rows * step);
6438
6439     // attach clBuffer to UMatData
6440     dst.u = new UMatData(getOpenCLAllocator());
6441     dst.u->data            = 0;
6442     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6443     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6444     dst.u->handle          = cl_mem_buffer;
6445     dst.u->origdata        = 0;
6446     dst.u->prevAllocator   = 0;
6447     dst.u->size            = total;
6448
6449     finalizeHdr(dst);
6450     dst.addref();
6451
6452     return;
6453 } // convertFromBuffer()
6454
6455
6456 /*
6457 // Convert OpenCL image2d_t memory to UMat
6458 */
6459 void convertFromImage(void* cl_mem_image, UMat& dst)
6460 {
6461     cl_mem             clImage = (cl_mem)cl_mem_image;
6462     cl_mem_object_type mem_type = 0;
6463
6464     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6465
6466     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6467
6468     cl_image_format fmt = { 0, 0 };
6469     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6470
6471     int depth = CV_8U;
6472     switch (fmt.image_channel_data_type)
6473     {
6474     case CL_UNORM_INT8:
6475     case CL_UNSIGNED_INT8:
6476         depth = CV_8U;
6477         break;
6478
6479     case CL_SNORM_INT8:
6480     case CL_SIGNED_INT8:
6481         depth = CV_8S;
6482         break;
6483
6484     case CL_UNORM_INT16:
6485     case CL_UNSIGNED_INT16:
6486         depth = CV_16U;
6487         break;
6488
6489     case CL_SNORM_INT16:
6490     case CL_SIGNED_INT16:
6491         depth = CV_16S;
6492         break;
6493
6494     case CL_SIGNED_INT32:
6495         depth = CV_32S;
6496         break;
6497
6498     case CL_FLOAT:
6499         depth = CV_32F;
6500         break;
6501
6502     default:
6503         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6504     }
6505
6506     int type = CV_8UC1;
6507     switch (fmt.image_channel_order)
6508     {
6509     case CL_R:
6510         type = CV_MAKE_TYPE(depth, 1);
6511         break;
6512
6513     case CL_RGBA:
6514     case CL_BGRA:
6515     case CL_ARGB:
6516         type = CV_MAKE_TYPE(depth, 4);
6517         break;
6518
6519     default:
6520         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6521         break;
6522     }
6523
6524     size_t step = 0;
6525     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6526
6527     size_t w = 0;
6528     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6529
6530     size_t h = 0;
6531     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6532
6533     dst.create((int)h, (int)w, type);
6534
6535     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6536
6537     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6538
6539     size_t offset = 0;
6540     size_t src_origin[3] = { 0, 0, 0 };
6541     size_t region[3] = { w, h, 1 };
6542     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6543
6544     CV_OCL_CHECK(clFinish(q));
6545
6546     return;
6547 } // convertFromImage()
6548
6549
6550 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6551
6552 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6553 {
6554     cl_uint numDevices = 0;
6555     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6556     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6557     {
6558         CV_OCL_DBG_CHECK_RESULT(status,
6559             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6560     }
6561
6562     if (numDevices == 0)
6563     {
6564         devices.clear();
6565         return;
6566     }
6567
6568     devices.resize((size_t)numDevices);
6569     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6570 }
6571
6572 struct PlatformInfo::Impl
6573 {
6574     Impl(void* id)
6575     {
6576         refcount = 1;
6577         handle = *(cl_platform_id*)id;
6578         getDevices(devices, handle);
6579
6580         version_ = getStrProp(CL_PLATFORM_VERSION);
6581         parseOpenCLVersion(version_, versionMajor_, versionMinor_);
6582     }
6583
6584     String getStrProp(cl_platform_info prop) const
6585     {
6586         char buf[1024];
6587         size_t sz=0;
6588         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6589             sz < sizeof(buf) ? String(buf) : String();
6590     }
6591
6592     IMPLEMENT_REFCOUNTABLE();
6593     std::vector<cl_device_id> devices;
6594     cl_platform_id handle;
6595
6596     String version_;
6597     int versionMajor_;
6598     int versionMinor_;
6599 };
6600
6601 PlatformInfo::PlatformInfo()
6602 {
6603     p = 0;
6604 }
6605
6606 PlatformInfo::PlatformInfo(void* platform_id)
6607 {
6608     p = new Impl(platform_id);
6609 }
6610
6611 PlatformInfo::~PlatformInfo()
6612 {
6613     if(p)
6614         p->release();
6615 }
6616
6617 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6618 {
6619     if (i.p)
6620         i.p->addref();
6621     p = i.p;
6622 }
6623
6624 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6625 {
6626     if (i.p != p)
6627     {
6628         if (i.p)
6629             i.p->addref();
6630         if (p)
6631             p->release();
6632         p = i.p;
6633     }
6634     return *this;
6635 }
6636
6637 int PlatformInfo::deviceNumber() const
6638 {
6639     return p ? (int)p->devices.size() : 0;
6640 }
6641
6642 void PlatformInfo::getDevice(Device& device, int d) const
6643 {
6644     CV_Assert(p && d < (int)p->devices.size() );
6645     if(p)
6646         device.set(p->devices[d]);
6647 }
6648
6649 String PlatformInfo::name() const
6650 {
6651     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6652 }
6653
6654 String PlatformInfo::vendor() const
6655 {
6656     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6657 }
6658
6659 String PlatformInfo::version() const
6660 {
6661     return p ? p->version_ : String();
6662 }
6663
6664 int PlatformInfo::versionMajor() const
6665 {
6666     CV_Assert(p);
6667     return p->versionMajor_;
6668 }
6669
6670 int PlatformInfo::versionMinor() const
6671 {
6672     CV_Assert(p);
6673     return p->versionMinor_;
6674 }
6675
6676 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6677 {
6678     cl_uint numPlatforms = 0;
6679     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6680
6681     if (numPlatforms == 0)
6682     {
6683         platforms.clear();
6684         return;
6685     }
6686
6687     platforms.resize((size_t)numPlatforms);
6688     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6689 }
6690
6691 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6692 {
6693     std::vector<cl_platform_id> platforms;
6694     getPlatforms(platforms);
6695
6696     for (size_t i = 0; i < platforms.size(); i++)
6697         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6698 }
6699
6700 const char* typeToStr(int type)
6701 {
6702     static const char* tab[]=
6703     {
6704         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6705         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6706         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6707         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6708         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6709         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6710         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6711         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6712         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6713     };
6714     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6715     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6716     CV_Assert(result);
6717     return result;
6718 }
6719
6720 const char* memopTypeToStr(int type)
6721 {
6722     static const char* tab[] =
6723     {
6724         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6725         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6726         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6727         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6728         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6729         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6730         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6731         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6732         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6733     };
6734     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6735     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6736     CV_Assert(result);
6737     return result;
6738 }
6739
6740 const char* vecopTypeToStr(int type)
6741 {
6742     static const char* tab[] =
6743     {
6744         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6745         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6746         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6747         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6748         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6749         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6750         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6751         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6752         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6753     };
6754     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6755     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6756     CV_Assert(result);
6757     return result;
6758 }
6759
6760 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6761 {
6762     if( sdepth == ddepth )
6763         return "noconvert";
6764     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6765     if( ddepth >= CV_32F ||
6766         (ddepth == CV_32S && sdepth < CV_32S) ||
6767         (ddepth == CV_16S && sdepth <= CV_8S) ||
6768         (ddepth == CV_16U && sdepth == CV_8U))
6769     {
6770         sprintf(buf, "convert_%s", typestr);
6771     }
6772     else if( sdepth >= CV_32F )
6773         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6774     else
6775         sprintf(buf, "convert_%s_sat", typestr);
6776
6777     return buf;
6778 }
6779
6780 const char* getOpenCLErrorString(int errorCode)
6781 {
6782 #define CV_OCL_CODE(id) case id: return #id
6783 #define CV_OCL_CODE_(id, name) case id: return #name
6784     switch (errorCode)
6785     {
6786     CV_OCL_CODE(CL_SUCCESS);
6787     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6788     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6789     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6790     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6791     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6792     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6793     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6794     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6795     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6796     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6797     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6798     CV_OCL_CODE(CL_MAP_FAILURE);
6799     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6800     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6801     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6802     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6803     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6804     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6805     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6806     CV_OCL_CODE(CL_INVALID_VALUE);
6807     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6808     CV_OCL_CODE(CL_INVALID_PLATFORM);
6809     CV_OCL_CODE(CL_INVALID_DEVICE);
6810     CV_OCL_CODE(CL_INVALID_CONTEXT);
6811     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6812     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6813     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6814     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6815     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6816     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6817     CV_OCL_CODE(CL_INVALID_SAMPLER);
6818     CV_OCL_CODE(CL_INVALID_BINARY);
6819     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6820     CV_OCL_CODE(CL_INVALID_PROGRAM);
6821     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6822     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6823     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6824     CV_OCL_CODE(CL_INVALID_KERNEL);
6825     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6826     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6827     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6828     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6829     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6830     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6831     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6832     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6833     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6834     CV_OCL_CODE(CL_INVALID_EVENT);
6835     CV_OCL_CODE(CL_INVALID_OPERATION);
6836     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6837     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6838     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6839     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6840     // OpenCL 1.1
6841     CV_OCL_CODE(CL_INVALID_PROPERTY);
6842     // OpenCL 1.2
6843     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6844     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6845     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6846     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6847     // OpenCL 2.0
6848     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6849     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6850     // Extensions
6851     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6852     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6853     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6854     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6855     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6856     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6857     default: return "Unknown OpenCL error";
6858     }
6859 #undef CV_OCL_CODE
6860 #undef CV_OCL_CODE_
6861 }
6862
6863 template <typename T>
6864 static std::string kerToStr(const Mat & k)
6865 {
6866     int width = k.cols - 1, depth = k.depth();
6867     const T * const data = k.ptr<T>();
6868
6869     std::ostringstream stream;
6870     stream.precision(10);
6871
6872     if (depth <= CV_8S)
6873     {
6874         for (int i = 0; i < width; ++i)
6875             stream << "DIG(" << (int)data[i] << ")";
6876         stream << "DIG(" << (int)data[width] << ")";
6877     }
6878     else if (depth == CV_32F)
6879     {
6880         stream.setf(std::ios_base::showpoint);
6881         for (int i = 0; i < width; ++i)
6882             stream << "DIG(" << data[i] << "f)";
6883         stream << "DIG(" << data[width] << "f)";
6884     }
6885     else
6886     {
6887         for (int i = 0; i < width; ++i)
6888             stream << "DIG(" << data[i] << ")";
6889         stream << "DIG(" << data[width] << ")";
6890     }
6891
6892     return stream.str();
6893 }
6894
6895 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6896 {
6897     Mat kernel = _kernel.getMat().reshape(1, 1);
6898
6899     int depth = kernel.depth();
6900     if (ddepth < 0)
6901         ddepth = depth;
6902
6903     if (ddepth != depth)
6904         kernel.convertTo(kernel, ddepth);
6905
6906     typedef std::string (* func_t)(const Mat &);
6907     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6908                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6909     const func_t func = funcs[ddepth];
6910     CV_Assert(func != 0);
6911
6912     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6913 }
6914
6915 #define PROCESS_SRC(src) \
6916     do \
6917     { \
6918         if (!src.empty()) \
6919         { \
6920             CV_Assert(src.isMat() || src.isUMat()); \
6921             Size csize = src.size(); \
6922             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6923                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6924             if (cwidth < ckercn || ckercn <= 0) \
6925                 return 1; \
6926             cols.push_back(cwidth); \
6927             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6928                 return 1; \
6929             offsets.push_back(src.offset()); \
6930             steps.push_back(src.step()); \
6931             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6932             kercns.push_back(ckercn); \
6933         } \
6934     } \
6935     while ((void)0, 0)
6936
6937 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6938                               InputArray src4, InputArray src5, InputArray src6,
6939                               InputArray src7, InputArray src8, InputArray src9,
6940                               OclVectorStrategy strat)
6941 {
6942     const ocl::Device & d = ocl::Device::getDefault();
6943
6944     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6945         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6946         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6947         d.preferredVectorWidthDouble(), -1 };
6948
6949     // if the device says don't use vectors
6950     if (vectorWidths[0] == 1)
6951     {
6952         // it's heuristic
6953         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6954         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6955         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6956     }
6957
6958     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6959 }
6960
6961 int checkOptimalVectorWidth(const int *vectorWidths,
6962                             InputArray src1, InputArray src2, InputArray src3,
6963                             InputArray src4, InputArray src5, InputArray src6,
6964                             InputArray src7, InputArray src8, InputArray src9,
6965                             OclVectorStrategy strat)
6966 {
6967     CV_Assert(vectorWidths);
6968
6969     int ref_type = src1.type();
6970
6971     std::vector<size_t> offsets, steps, cols;
6972     std::vector<int> dividers, kercns;
6973     PROCESS_SRC(src1);
6974     PROCESS_SRC(src2);
6975     PROCESS_SRC(src3);
6976     PROCESS_SRC(src4);
6977     PROCESS_SRC(src5);
6978     PROCESS_SRC(src6);
6979     PROCESS_SRC(src7);
6980     PROCESS_SRC(src8);
6981     PROCESS_SRC(src9);
6982
6983     size_t size = offsets.size();
6984
6985     for (size_t i = 0; i < size; ++i)
6986         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6987             dividers[i] >>= 1, kercns[i] >>= 1;
6988
6989     // default strategy
6990     int kercn = *std::min_element(kercns.begin(), kercns.end());
6991
6992     return kercn;
6993 }
6994
6995 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6996                                  InputArray src4, InputArray src5, InputArray src6,
6997                                  InputArray src7, InputArray src8, InputArray src9)
6998 {
6999     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
7000 }
7001
7002 #undef PROCESS_SRC
7003
7004
7005 // TODO Make this as a method of OpenCL "BuildOptions" class
7006 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
7007 {
7008     if (!buildOptions.empty())
7009         buildOptions += " ";
7010     int type = _m.type(), depth = CV_MAT_DEPTH(type);
7011     buildOptions += format(
7012             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
7013             name.c_str(), ocl::typeToStr(type),
7014             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
7015             name.c_str(), (int)CV_MAT_CN(type),
7016             name.c_str(), (int)CV_ELEM_SIZE(type),
7017             name.c_str(), (int)CV_ELEM_SIZE1(type),
7018             name.c_str(), (int)depth
7019             );
7020 }
7021
7022
7023 struct Image2D::Impl
7024 {
7025     Impl(const UMat &src, bool norm, bool alias)
7026     {
7027         handle = 0;
7028         refcount = 1;
7029         init(src, norm, alias);
7030     }
7031
7032     ~Impl()
7033     {
7034         if (handle)
7035             clReleaseMemObject(handle);
7036     }
7037
7038     static cl_image_format getImageFormat(int depth, int cn, bool norm)
7039     {
7040         cl_image_format format;
7041         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7042                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7043         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7044                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
7045         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7046
7047         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7048         int channelOrder = channelOrders[cn];
7049         format.image_channel_data_type = (cl_channel_type)channelType;
7050         format.image_channel_order = (cl_channel_order)channelOrder;
7051         return format;
7052     }
7053
7054     static bool isFormatSupported(cl_image_format format)
7055     {
7056         if (!haveOpenCL())
7057             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7058
7059         cl_context context = (cl_context)Context::getDefault().ptr();
7060         if (!context)
7061             return false;
7062
7063         // Figure out how many formats are supported by this context.
7064         cl_uint numFormats = 0;
7065         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7066                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
7067                                                 NULL, &numFormats);
7068         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7069         if (numFormats > 0)
7070         {
7071             AutoBuffer<cl_image_format> formats(numFormats);
7072             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7073                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
7074                                              formats.data(), NULL);
7075             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7076             for (cl_uint i = 0; i < numFormats; ++i)
7077             {
7078                 if (!memcmp(&formats[i], &format, sizeof(format)))
7079                 {
7080                     return true;
7081                 }
7082             }
7083         }
7084         return false;
7085     }
7086
7087     void init(const UMat &src, bool norm, bool alias)
7088     {
7089         if (!haveOpenCL())
7090             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7091
7092         CV_Assert(!src.empty());
7093         CV_Assert(ocl::Device::getDefault().imageSupport());
7094
7095         int err, depth = src.depth(), cn = src.channels();
7096         CV_Assert(cn <= 4);
7097         cl_image_format format = getImageFormat(depth, cn, norm);
7098
7099         if (!isFormatSupported(format))
7100             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7101
7102         if (alias && !src.handle(ACCESS_RW))
7103             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7104
7105         cl_context context = (cl_context)Context::getDefault().ptr();
7106         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7107
7108 #ifdef CL_VERSION_1_2
7109         // this enables backwards portability to
7110         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7111         const Device & d = ocl::Device::getDefault();
7112         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7113         CV_Assert(!alias || canCreateAlias(src));
7114         if (1 < major || (1 == major && 2 <= minor))
7115         {
7116             cl_image_desc desc;
7117             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7118             desc.image_width      = src.cols;
7119             desc.image_height     = src.rows;
7120             desc.image_depth      = 0;
7121             desc.image_array_size = 1;
7122             desc.image_row_pitch  = alias ? src.step[0] : 0;
7123             desc.image_slice_pitch = 0;
7124             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7125             desc.num_mip_levels   = 0;
7126             desc.num_samples      = 0;
7127             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7128         }
7129         else
7130 #endif
7131         {
7132             CV_SUPPRESS_DEPRECATED_START
7133             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7134             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7135             CV_SUPPRESS_DEPRECATED_END
7136         }
7137         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7138
7139         size_t origin[] = { 0, 0, 0 };
7140         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7141
7142         cl_mem devData;
7143         if (!alias && !src.isContinuous())
7144         {
7145             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7146             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7147                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7148                 ).c_str());
7149
7150             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7151             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7152                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7153             CV_OCL_DBG_CHECK(clFlush(queue));
7154         }
7155         else
7156         {
7157             devData = (cl_mem)src.handle(ACCESS_READ);
7158         }
7159         CV_Assert(devData != NULL);
7160
7161         if (!alias)
7162         {
7163             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7164             if (!src.isContinuous())
7165             {
7166                 CV_OCL_DBG_CHECK(clFlush(queue));
7167                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7168             }
7169         }
7170     }
7171
7172     IMPLEMENT_REFCOUNTABLE();
7173
7174     cl_mem handle;
7175 };
7176
7177 Image2D::Image2D()
7178 {
7179     p = NULL;
7180 }
7181
7182 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7183 {
7184     p = new Impl(src, norm, alias);
7185 }
7186
7187 bool Image2D::canCreateAlias(const UMat &m)
7188 {
7189     bool ret = false;
7190     const Device & d = ocl::Device::getDefault();
7191     if (d.imageFromBufferSupport() && !m.empty())
7192     {
7193         // This is the required pitch alignment in pixels
7194         uint pitchAlign = d.imagePitchAlignment();
7195         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7196         {
7197             // We don't currently handle the case where the buffer was created
7198             // with CL_MEM_USE_HOST_PTR
7199             if (!m.u->tempUMat())
7200             {
7201                 ret = true;
7202             }
7203         }
7204     }
7205     return ret;
7206 }
7207
7208 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7209 {
7210     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7211
7212     return Impl::isFormatSupported(format);
7213 }
7214
7215 Image2D::Image2D(const Image2D & i)
7216 {
7217     p = i.p;
7218     if (p)
7219         p->addref();
7220 }
7221
7222 Image2D & Image2D::operator = (const Image2D & i)
7223 {
7224     if (i.p != p)
7225     {
7226         if (i.p)
7227             i.p->addref();
7228         if (p)
7229             p->release();
7230         p = i.p;
7231     }
7232     return *this;
7233 }
7234
7235 Image2D::~Image2D()
7236 {
7237     if (p)
7238         p->release();
7239 }
7240
7241 void* Image2D::ptr() const
7242 {
7243     return p ? p->handle : 0;
7244 }
7245
7246 bool internal::isOpenCLForced()
7247 {
7248     static bool initialized = false;
7249     static bool value = false;
7250     if (!initialized)
7251     {
7252         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7253         initialized = true;
7254     }
7255     return value;
7256 }
7257
7258 bool internal::isPerformanceCheckBypassed()
7259 {
7260     static bool initialized = false;
7261     static bool value = false;
7262     if (!initialized)
7263     {
7264         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7265         initialized = true;
7266     }
7267     return value;
7268 }
7269
7270 bool internal::isCLBuffer(UMat& u)
7271 {
7272     void* h = u.handle(ACCESS_RW);
7273     if (!h)
7274         return true;
7275     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7276 #if 1
7277     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7278         return false;
7279 #else
7280     cl_mem_object_type type = 0;
7281     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7282     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7283         return false;
7284 #endif
7285     return true;
7286 }
7287
7288 struct Timer::Impl
7289 {
7290     const Queue queue;
7291
7292     Impl(const Queue& q)
7293         : queue(q)
7294     {
7295     }
7296
7297     ~Impl(){}
7298
7299     void start()
7300     {
7301         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7302         timer.start();
7303     }
7304
7305     void stop()
7306     {
7307         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7308         timer.stop();
7309     }
7310
7311     uint64 durationNS() const
7312     {
7313         return (uint64)(timer.getTimeSec() * 1e9);
7314     }
7315
7316     TickMeter timer;
7317 };
7318
7319 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7320 Timer::~Timer() { delete p; }
7321
7322 void Timer::start()
7323 {
7324     CV_Assert(p);
7325     p->start();
7326 }
7327
7328 void Timer::stop()
7329 {
7330     CV_Assert(p);
7331     p->stop();
7332 }
7333
7334 uint64 Timer::durationNS() const
7335 {
7336     CV_Assert(p);
7337     return p->durationNS();
7338 }
7339
7340 }} // namespace
7341
7342 #ifdef HAVE_DIRECTX
7343 namespace cv { namespace directx { namespace internal {
7344 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7345 {
7346     ocl::Context::Impl* i = ctx.getImpl();
7347     CV_Assert(i);
7348     return i->getDirectXImpl();
7349 }
7350 }}} // namespace cv::directx::internal
7351 #endif
7352
7353 #endif // HAVE_OPENCL