Merge pull request #18362 from alalek:ocl_async_kernel_reschedule_bug
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832 // true if we have initialized OpenCL subsystem with available platforms
833 static bool g_isOpenCVActivated = false;
834
835 bool haveOpenCL()
836 {
837     CV_TRACE_FUNCTION();
838     static bool g_isOpenCLInitialized = false;
839     static bool g_isOpenCLAvailable = false;
840
841     if (!g_isOpenCLInitialized)
842     {
843         CV_TRACE_REGION("Init_OpenCL_Runtime");
844         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
845         if (envPath)
846         {
847             if (cv::String(envPath) == "disabled")
848             {
849                 g_isOpenCLAvailable = false;
850                 g_isOpenCLInitialized = true;
851             }
852         }
853         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
854         try
855         {
856             cl_uint n = 0;
857             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
858             g_isOpenCVActivated = n > 0;
859         }
860         catch (...)
861         {
862             g_isOpenCLAvailable = false;
863         }
864         g_isOpenCLInitialized = true;
865     }
866     return g_isOpenCLAvailable;
867 }
868
869 bool useOpenCL()
870 {
871     CoreTLSData& data = getCoreTlsData();
872     if (data.useOpenCL < 0)
873     {
874         try
875         {
876             data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
877         }
878         catch (...)
879         {
880             data.useOpenCL = 0;
881         }
882     }
883     return data.useOpenCL > 0;
884 }
885
886 bool isOpenCLActivated()
887 {
888     if (!g_isOpenCVActivated)
889         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
890     return useOpenCL();
891 }
892
893 void setUseOpenCL(bool flag)
894 {
895     CV_TRACE_FUNCTION();
896
897     CoreTLSData& data = getCoreTlsData();
898     if (!flag)
899     {
900         data.useOpenCL = 0;
901     }
902     else if( haveOpenCL() )
903     {
904         data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
905     }
906 }
907
908 #ifdef HAVE_CLAMDBLAS
909
910 class AmdBlasHelper
911 {
912 public:
913     static AmdBlasHelper & getInstance()
914     {
915         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
916     }
917
918     bool isAvailable() const
919     {
920         return g_isAmdBlasAvailable;
921     }
922
923     ~AmdBlasHelper()
924     {
925         try
926         {
927             clAmdBlasTeardown();
928         }
929         catch (...) { }
930     }
931
932 protected:
933     AmdBlasHelper()
934     {
935         if (!g_isAmdBlasInitialized)
936         {
937             AutoLock lock(getInitializationMutex());
938
939             if (!g_isAmdBlasInitialized)
940             {
941                 if (haveOpenCL())
942                 {
943                     try
944                     {
945                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
946                     }
947                     catch (...)
948                     {
949                         g_isAmdBlasAvailable = false;
950                     }
951                 }
952                 else
953                     g_isAmdBlasAvailable = false;
954
955                 g_isAmdBlasInitialized = true;
956             }
957         }
958     }
959
960 private:
961     static bool g_isAmdBlasInitialized;
962     static bool g_isAmdBlasAvailable;
963 };
964
965 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
966 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
967
968 bool haveAmdBlas()
969 {
970     return AmdBlasHelper::getInstance().isAvailable();
971 }
972
973 #else
974
975 bool haveAmdBlas()
976 {
977     return false;
978 }
979
980 #endif
981
982 #ifdef HAVE_CLAMDFFT
983
984 class AmdFftHelper
985 {
986 public:
987     static AmdFftHelper & getInstance()
988     {
989         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
990     }
991
992     bool isAvailable() const
993     {
994         return g_isAmdFftAvailable;
995     }
996
997     ~AmdFftHelper()
998     {
999         try
1000         {
1001 //            clAmdFftTeardown();
1002         }
1003         catch (...) { }
1004     }
1005
1006 protected:
1007     AmdFftHelper()
1008     {
1009         if (!g_isAmdFftInitialized)
1010         {
1011             AutoLock lock(getInitializationMutex());
1012
1013             if (!g_isAmdFftInitialized)
1014             {
1015                 if (haveOpenCL())
1016                 {
1017                     try
1018                     {
1019                         cl_uint major, minor, patch;
1020                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1021
1022                         // it throws exception in case AmdFft binaries are not found
1023                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1024                         g_isAmdFftAvailable = true;
1025                     }
1026                     catch (const Exception &)
1027                     {
1028                         g_isAmdFftAvailable = false;
1029                     }
1030                 }
1031                 else
1032                     g_isAmdFftAvailable = false;
1033
1034                 g_isAmdFftInitialized = true;
1035             }
1036         }
1037     }
1038
1039 private:
1040     static clAmdFftSetupData setupData;
1041     static bool g_isAmdFftInitialized;
1042     static bool g_isAmdFftAvailable;
1043 };
1044
1045 clAmdFftSetupData AmdFftHelper::setupData;
1046 bool AmdFftHelper::g_isAmdFftAvailable = false;
1047 bool AmdFftHelper::g_isAmdFftInitialized = false;
1048
1049 bool haveAmdFft()
1050 {
1051     return AmdFftHelper::getInstance().isAvailable();
1052 }
1053
1054 #else
1055
1056 bool haveAmdFft()
1057 {
1058     return false;
1059 }
1060
1061 #endif
1062
1063 bool haveSVM()
1064 {
1065 #ifdef HAVE_OPENCL_SVM
1066     return true;
1067 #else
1068     return false;
1069 #endif
1070 }
1071
1072 void finish()
1073 {
1074     Queue::getDefault().finish();
1075 }
1076
1077 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1078
1079 struct Platform::Impl
1080 {
1081     Impl()
1082     {
1083         refcount = 1;
1084         handle = 0;
1085         initialized = false;
1086     }
1087
1088     ~Impl() {}
1089
1090     void init()
1091     {
1092         if( !initialized )
1093         {
1094             //cl_uint num_entries
1095             cl_uint n = 0;
1096             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1097                 handle = 0;
1098             if( handle != 0 )
1099             {
1100                 char buf[1000];
1101                 size_t len = 0;
1102                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1103                 buf[len] = '\0';
1104                 vendor = String(buf);
1105             }
1106
1107             initialized = true;
1108         }
1109     }
1110
1111     IMPLEMENT_REFCOUNTABLE();
1112
1113     cl_platform_id handle;
1114     String vendor;
1115     bool initialized;
1116 };
1117
1118 Platform::Platform()
1119 {
1120     p = 0;
1121 }
1122
1123 Platform::~Platform()
1124 {
1125     if(p)
1126         p->release();
1127 }
1128
1129 Platform::Platform(const Platform& pl)
1130 {
1131     p = (Impl*)pl.p;
1132     if(p)
1133         p->addref();
1134 }
1135
1136 Platform& Platform::operator = (const Platform& pl)
1137 {
1138     Impl* newp = (Impl*)pl.p;
1139     if(newp)
1140         newp->addref();
1141     if(p)
1142         p->release();
1143     p = newp;
1144     return *this;
1145 }
1146
1147 void* Platform::ptr() const
1148 {
1149     return p ? p->handle : 0;
1150 }
1151
1152 Platform& Platform::getDefault()
1153 {
1154     static Platform p;
1155     if( !p.p )
1156     {
1157         p.p = new Impl;
1158         p.p->init();
1159     }
1160     return p;
1161 }
1162
1163 /////////////////////////////////////// Device ////////////////////////////////////////////
1164
1165 // deviceVersion has format
1166 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1167 // by specification
1168 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1169 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1170 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1171 {
1172     major = minor = 0;
1173     if (10 >= deviceVersion.length())
1174         return;
1175     const char *pstr = deviceVersion.c_str();
1176     if (0 != strncmp(pstr, "OpenCL ", 7))
1177         return;
1178     size_t ppos = deviceVersion.find('.', 7);
1179     if (String::npos == ppos)
1180         return;
1181     String temp = deviceVersion.substr(7, ppos - 7);
1182     major = atoi(temp.c_str());
1183     temp = deviceVersion.substr(ppos + 1);
1184     minor = atoi(temp.c_str());
1185 }
1186
1187 struct Device::Impl
1188 {
1189     Impl(void* d)
1190     {
1191         handle = (cl_device_id)d;
1192         refcount = 1;
1193
1194         name_ = getStrProp(CL_DEVICE_NAME);
1195         version_ = getStrProp(CL_DEVICE_VERSION);
1196         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1197         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1198         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1199         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1200         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1201         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1202         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1203         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1204
1205         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1206         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1207
1208         size_t pos = 0;
1209         while (pos < extensions_.size())
1210         {
1211             size_t pos2 = extensions_.find(' ', pos);
1212             if (pos2 == String::npos)
1213                 pos2 = extensions_.size();
1214             if (pos2 > pos)
1215             {
1216                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1217                 extensions_set_.insert(extensionName);
1218             }
1219             pos = pos2 + 1;
1220         }
1221
1222         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1223
1224         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1225         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1226             vendorName_ == "AMD")
1227             vendorID_ = VENDOR_AMD;
1228         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1229             vendorID_ = VENDOR_INTEL;
1230         else if (vendorName_ == "NVIDIA Corporation")
1231             vendorID_ = VENDOR_NVIDIA;
1232         else
1233             vendorID_ = UNKNOWN_VENDOR;
1234
1235         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1236         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1237         {
1238             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1239             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1240                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1241             maxWorkGroupSize_ = new_maxWorkGroupSize;
1242         }
1243 #if 0
1244         if (isExtensionSupported("cl_khr_spir"))
1245         {
1246 #ifndef CL_DEVICE_SPIR_VERSIONS
1247 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1248 #endif
1249             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1250             std::cout << spir_versions << std::endl;
1251         }
1252 #endif
1253     }
1254
1255     template<typename _TpCL, typename _TpOut>
1256     _TpOut getProp(cl_device_info prop) const
1257     {
1258         _TpCL temp=_TpCL();
1259         size_t sz = 0;
1260
1261         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1262             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1263     }
1264
1265     bool getBoolProp(cl_device_info prop) const
1266     {
1267         cl_bool temp = CL_FALSE;
1268         size_t sz = 0;
1269
1270         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1271             sz == sizeof(temp) ? temp != 0 : false;
1272     }
1273
1274     String getStrProp(cl_device_info prop) const
1275     {
1276         char buf[4096];
1277         size_t sz=0;
1278         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1279             sz < sizeof(buf) ? String(buf) : String();
1280     }
1281
1282     bool isExtensionSupported(const std::string& extensionName) const
1283     {
1284         return extensions_set_.count(extensionName) > 0;
1285     }
1286
1287
1288     IMPLEMENT_REFCOUNTABLE();
1289
1290     cl_device_id handle;
1291
1292     String name_;
1293     String version_;
1294     std::string extensions_;
1295     int doubleFPConfig_;
1296     bool hostUnifiedMemory_;
1297     int maxComputeUnits_;
1298     size_t maxWorkGroupSize_;
1299     int type_;
1300     int addressBits_;
1301     int deviceVersionMajor_;
1302     int deviceVersionMinor_;
1303     String driverVersion_;
1304     String vendorName_;
1305     int vendorID_;
1306     bool intelSubgroupsSupport_;
1307
1308     std::set<std::string> extensions_set_;
1309 };
1310
1311
1312 Device::Device()
1313 {
1314     p = 0;
1315 }
1316
1317 Device::Device(void* d)
1318 {
1319     p = 0;
1320     set(d);
1321 }
1322
1323 Device::Device(const Device& d)
1324 {
1325     p = d.p;
1326     if(p)
1327         p->addref();
1328 }
1329
1330 Device& Device::operator = (const Device& d)
1331 {
1332     Impl* newp = (Impl*)d.p;
1333     if(newp)
1334         newp->addref();
1335     if(p)
1336         p->release();
1337     p = newp;
1338     return *this;
1339 }
1340
1341 Device::~Device()
1342 {
1343     if(p)
1344         p->release();
1345 }
1346
1347 void Device::set(void* d)
1348 {
1349     if(p)
1350         p->release();
1351     p = new Impl(d);
1352 }
1353
1354 void* Device::ptr() const
1355 {
1356     return p ? p->handle : 0;
1357 }
1358
1359 String Device::name() const
1360 { return p ? p->name_ : String(); }
1361
1362 String Device::extensions() const
1363 { return p ? String(p->extensions_) : String(); }
1364
1365 bool Device::isExtensionSupported(const String& extensionName) const
1366 { return p ? p->isExtensionSupported(extensionName) : false; }
1367
1368 String Device::version() const
1369 { return p ? p->version_ : String(); }
1370
1371 String Device::vendorName() const
1372 { return p ? p->vendorName_ : String(); }
1373
1374 int Device::vendorID() const
1375 { return p ? p->vendorID_ : 0; }
1376
1377 String Device::OpenCL_C_Version() const
1378 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1379
1380 String Device::OpenCLVersion() const
1381 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1382
1383 int Device::deviceVersionMajor() const
1384 { return p ? p->deviceVersionMajor_ : 0; }
1385
1386 int Device::deviceVersionMinor() const
1387 { return p ? p->deviceVersionMinor_ : 0; }
1388
1389 String Device::driverVersion() const
1390 { return p ? p->driverVersion_ : String(); }
1391
1392 int Device::type() const
1393 { return p ? p->type_ : 0; }
1394
1395 int Device::addressBits() const
1396 { return p ? p->addressBits_ : 0; }
1397
1398 bool Device::available() const
1399 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1400
1401 bool Device::compilerAvailable() const
1402 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1403
1404 bool Device::linkerAvailable() const
1405 #ifdef CL_VERSION_1_2
1406 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1407 #else
1408 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1409 #endif
1410
1411 int Device::doubleFPConfig() const
1412 { return p ? p->doubleFPConfig_ : 0; }
1413
1414 int Device::singleFPConfig() const
1415 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1416
1417 int Device::halfFPConfig() const
1418 #ifdef CL_VERSION_1_2
1419 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1420 #else
1421 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1422 #endif
1423
1424 bool Device::endianLittle() const
1425 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1426
1427 bool Device::errorCorrectionSupport() const
1428 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1429
1430 int Device::executionCapabilities() const
1431 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1432
1433 size_t Device::globalMemCacheSize() const
1434 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1435
1436 int Device::globalMemCacheType() const
1437 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1438
1439 int Device::globalMemCacheLineSize() const
1440 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1441
1442 size_t Device::globalMemSize() const
1443 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1444
1445 size_t Device::localMemSize() const
1446 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1447
1448 int Device::localMemType() const
1449 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1450
1451 bool Device::hostUnifiedMemory() const
1452 { return p ? p->hostUnifiedMemory_ : false; }
1453
1454 bool Device::imageSupport() const
1455 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1456
1457 bool Device::imageFromBufferSupport() const
1458 {
1459     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1460 }
1461
1462 uint Device::imagePitchAlignment() const
1463 {
1464 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1465     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1466 #else
1467     return 0;
1468 #endif
1469 }
1470
1471 uint Device::imageBaseAddressAlignment() const
1472 {
1473 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1474     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1475 #else
1476     return 0;
1477 #endif
1478 }
1479
1480 size_t Device::image2DMaxWidth() const
1481 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1482
1483 size_t Device::image2DMaxHeight() const
1484 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1485
1486 size_t Device::image3DMaxWidth() const
1487 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1488
1489 size_t Device::image3DMaxHeight() const
1490 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1491
1492 size_t Device::image3DMaxDepth() const
1493 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1494
1495 size_t Device::imageMaxBufferSize() const
1496 #ifdef CL_VERSION_1_2
1497 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1498 #else
1499 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1500 #endif
1501
1502 size_t Device::imageMaxArraySize() const
1503 #ifdef CL_VERSION_1_2
1504 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1505 #else
1506 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1507 #endif
1508
1509 bool Device::intelSubgroupsSupport() const
1510 { return p ? p->intelSubgroupsSupport_ : false; }
1511
1512 int Device::maxClockFrequency() const
1513 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1514
1515 int Device::maxComputeUnits() const
1516 { return p ? p->maxComputeUnits_ : 0; }
1517
1518 int Device::maxConstantArgs() const
1519 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1520
1521 size_t Device::maxConstantBufferSize() const
1522 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1523
1524 size_t Device::maxMemAllocSize() const
1525 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1526
1527 size_t Device::maxParameterSize() const
1528 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1529
1530 int Device::maxReadImageArgs() const
1531 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1532
1533 int Device::maxWriteImageArgs() const
1534 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1535
1536 int Device::maxSamplers() const
1537 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1538
1539 size_t Device::maxWorkGroupSize() const
1540 { return p ? p->maxWorkGroupSize_ : 0; }
1541
1542 int Device::maxWorkItemDims() const
1543 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1544
1545 void Device::maxWorkItemSizes(size_t* sizes) const
1546 {
1547     if(p)
1548     {
1549         const int MAX_DIMS = 32;
1550         size_t retsz = 0;
1551         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1552                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1553     }
1554 }
1555
1556 int Device::memBaseAddrAlign() const
1557 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1558
1559 int Device::nativeVectorWidthChar() const
1560 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1561
1562 int Device::nativeVectorWidthShort() const
1563 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1564
1565 int Device::nativeVectorWidthInt() const
1566 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1567
1568 int Device::nativeVectorWidthLong() const
1569 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1570
1571 int Device::nativeVectorWidthFloat() const
1572 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1573
1574 int Device::nativeVectorWidthDouble() const
1575 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1576
1577 int Device::nativeVectorWidthHalf() const
1578 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1579
1580 int Device::preferredVectorWidthChar() const
1581 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1582
1583 int Device::preferredVectorWidthShort() const
1584 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1585
1586 int Device::preferredVectorWidthInt() const
1587 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1588
1589 int Device::preferredVectorWidthLong() const
1590 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1591
1592 int Device::preferredVectorWidthFloat() const
1593 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1594
1595 int Device::preferredVectorWidthDouble() const
1596 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1597
1598 int Device::preferredVectorWidthHalf() const
1599 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1600
1601 size_t Device::printfBufferSize() const
1602 #ifdef CL_VERSION_1_2
1603 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1604 #else
1605 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1606 #endif
1607
1608
1609 size_t Device::profilingTimerResolution() const
1610 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1611
1612 const Device& Device::getDefault()
1613 {
1614     const Context& ctx = Context::getDefault();
1615     int idx = getCoreTlsData().device;
1616     const Device& device = ctx.device(idx);
1617     return device;
1618 }
1619
1620 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1621
1622 template <typename Functor, typename ObjectType>
1623 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
1624 {
1625     ::size_t required;
1626     cl_int err = f(obj, name, 0, NULL, &required);
1627     if (err != CL_SUCCESS)
1628         return err;
1629
1630     param.clear();
1631     if (required > 0)
1632     {
1633         AutoBuffer<char> buf(required + 1);
1634         char* ptr = buf.data(); // cleanup is not needed
1635         err = f(obj, name, required, ptr, NULL);
1636         if (err != CL_SUCCESS)
1637             return err;
1638         param = ptr;
1639     }
1640
1641     return CL_SUCCESS;
1642 }
1643
1644 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
1645 {
1646     elems.clear();
1647     if (s.size() == 0)
1648         return;
1649     std::istringstream ss(s);
1650     std::string item;
1651     while (!ss.eof())
1652     {
1653         std::getline(ss, item, delim);
1654         elems.push_back(item);
1655     }
1656 }
1657
1658 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
1659 // Sample: AMD:GPU:
1660 // Sample: AMD:GPU:Tahiti
1661 // Sample: :GPU|CPU: = '' = ':' = '::'
1662 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
1663         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
1664 {
1665     std::vector<std::string> parts;
1666     split(configurationStr, ':', parts);
1667     if (parts.size() > 3)
1668     {
1669         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
1670         return false;
1671     }
1672     if (parts.size() > 2)
1673         deviceNameOrID = parts[2];
1674     if (parts.size() > 1)
1675     {
1676         split(parts[1], '|', deviceTypes);
1677     }
1678     if (parts.size() > 0)
1679     {
1680         platform = parts[0];
1681     }
1682     return true;
1683 }
1684
1685 #if defined WINRT || defined _WIN32_WCE
1686 static cl_device_id selectOpenCLDevice()
1687 {
1688     return NULL;
1689 }
1690 #else
1691 // std::tolower is int->int
1692 static char char_tolower(char ch)
1693 {
1694     return (char)std::tolower((int)ch);
1695 }
1696 static cl_device_id selectOpenCLDevice()
1697 {
1698     std::string platform, deviceName;
1699     std::vector<std::string> deviceTypes;
1700
1701     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
1702     if (configuration &&
1703             (strcmp(configuration, "disabled") == 0 ||
1704              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
1705             ))
1706         return NULL;
1707
1708     bool isID = false;
1709     int deviceID = -1;
1710     if (deviceName.length() == 1)
1711     // We limit ID range to 0..9, because we want to write:
1712     // - '2500' to mean i5-2500
1713     // - '8350' to mean AMD FX-8350
1714     // - '650' to mean GeForce 650
1715     // To extend ID range change condition to '> 0'
1716     {
1717         isID = true;
1718         for (size_t i = 0; i < deviceName.length(); i++)
1719         {
1720             if (!isdigit(deviceName[i]))
1721             {
1722                 isID = false;
1723                 break;
1724             }
1725         }
1726         if (isID)
1727         {
1728             deviceID = atoi(deviceName.c_str());
1729             if (deviceID < 0)
1730                 return NULL;
1731         }
1732     }
1733
1734     std::vector<cl_platform_id> platforms;
1735     {
1736         cl_uint numPlatforms = 0;
1737         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
1738
1739         if (numPlatforms == 0)
1740             return NULL;
1741         platforms.resize((size_t)numPlatforms);
1742         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
1743         platforms.resize(numPlatforms);
1744     }
1745
1746     int selectedPlatform = -1;
1747     if (platform.length() > 0)
1748     {
1749         for (size_t i = 0; i < platforms.size(); i++)
1750         {
1751             std::string name;
1752             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
1753             if (name.find(platform) != std::string::npos)
1754             {
1755                 selectedPlatform = (int)i;
1756                 break;
1757             }
1758         }
1759         if (selectedPlatform == -1)
1760         {
1761             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
1762             goto not_found;
1763         }
1764     }
1765     if (deviceTypes.size() == 0)
1766     {
1767         if (!isID)
1768         {
1769             deviceTypes.push_back("GPU");
1770             if (configuration)
1771                 deviceTypes.push_back("CPU");
1772         }
1773         else
1774             deviceTypes.push_back("ALL");
1775     }
1776     for (size_t t = 0; t < deviceTypes.size(); t++)
1777     {
1778         int deviceType = 0;
1779         std::string tempStrDeviceType = deviceTypes[t];
1780         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
1781
1782         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1783             deviceType = Device::TYPE_GPU;
1784         else if (tempStrDeviceType == "cpu")
1785             deviceType = Device::TYPE_CPU;
1786         else if (tempStrDeviceType == "accelerator")
1787             deviceType = Device::TYPE_ACCELERATOR;
1788         else if (tempStrDeviceType == "all")
1789             deviceType = Device::TYPE_ALL;
1790         else
1791         {
1792             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
1793             goto not_found;
1794         }
1795
1796         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
1797         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
1798                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
1799                 i++)
1800         {
1801             cl_uint count = 0;
1802             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
1803             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1804             {
1805                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
1806             }
1807             if (count == 0)
1808                 continue;
1809             size_t base = devices.size();
1810             devices.resize(base + count);
1811             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
1812             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1813             {
1814                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
1815             }
1816         }
1817
1818         for (size_t i = (isID ? deviceID : 0);
1819              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
1820              i++)
1821         {
1822             std::string name;
1823             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
1824             cl_bool useGPU = true;
1825             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1826             {
1827                 cl_bool isIGPU = CL_FALSE;
1828                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
1829                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
1830             }
1831             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
1832             {
1833                 // TODO check for OpenCL 1.1
1834                 return devices[i];
1835             }
1836         }
1837     }
1838
1839 not_found:
1840     if (!configuration)
1841         return NULL; // suppress messages on stderr
1842
1843     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
1844             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
1845             << "    Device types: ";
1846     for (size_t t = 0; t < deviceTypes.size(); t++)
1847         std::cerr << deviceTypes[t] << " ";
1848
1849     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
1850     return NULL;
1851 }
1852 #endif
1853
1854 #ifdef HAVE_OPENCL_SVM
1855 namespace svm {
1856
1857 enum AllocatorFlags { // don't use first 16 bits
1858         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
1859         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
1860         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
1861         OPENCL_SVM_BUFFER_MASK = 3 << 16,
1862         OPENCL_SVM_BUFFER_MAP = 4 << 16
1863 };
1864
1865 static bool checkForceSVMUmatUsage()
1866 {
1867     static bool initialized = false;
1868     static bool force = false;
1869     if (!initialized)
1870     {
1871         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
1872         initialized = true;
1873     }
1874     return force;
1875 }
1876 static bool checkDisableSVMUMatUsage()
1877 {
1878     static bool initialized = false;
1879     static bool force = false;
1880     if (!initialized)
1881     {
1882         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
1883         initialized = true;
1884     }
1885     return force;
1886 }
1887 static bool checkDisableSVM()
1888 {
1889     static bool initialized = false;
1890     static bool force = false;
1891     if (!initialized)
1892     {
1893         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
1894         initialized = true;
1895     }
1896     return force;
1897 }
1898 // see SVMCapabilities
1899 static unsigned int getSVMCapabilitiesMask()
1900 {
1901     static bool initialized = false;
1902     static unsigned int mask = 0;
1903     if (!initialized)
1904     {
1905         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
1906         if (envValue == NULL)
1907         {
1908             return ~0U; // all bits 1
1909         }
1910         mask = atoi(envValue);
1911         initialized = true;
1912     }
1913     return mask;
1914 }
1915 } // namespace
1916 #endif
1917
1918 static size_t getProgramCountLimit()
1919 {
1920     static bool initialized = false;
1921     static size_t count = 0;
1922     if (!initialized)
1923     {
1924         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
1925         initialized = true;
1926     }
1927     return count;
1928 }
1929
1930 struct Context::Impl
1931 {
1932     static Context::Impl* get(Context& context) { return context.p; }
1933
1934     void __init()
1935     {
1936         refcount = 1;
1937         handle = 0;
1938 #ifdef HAVE_OPENCL_SVM
1939         svmInitialized = false;
1940 #endif
1941     }
1942
1943     Impl()
1944     {
1945         __init();
1946     }
1947
1948     void setDefault()
1949     {
1950         CV_Assert(handle == NULL);
1951
1952         cl_device_id d = selectOpenCLDevice();
1953
1954         if (d == NULL)
1955             return;
1956
1957         cl_platform_id pl = NULL;
1958         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
1959
1960         cl_context_properties prop[] =
1961         {
1962             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1963             0
1964         };
1965
1966         // !!! in the current implementation force the number of devices to 1 !!!
1967         cl_uint nd = 1;
1968         cl_int status;
1969
1970         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
1971         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
1972
1973         bool ok = handle != 0 && status == CL_SUCCESS;
1974         if( ok )
1975         {
1976             devices.resize(nd);
1977             devices[0].set(d);
1978         }
1979         else
1980             handle = NULL;
1981     }
1982
1983     Impl(int dtype0)
1984     {
1985         __init();
1986
1987         cl_int retval = 0;
1988         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
1989         cl_context_properties prop[] =
1990         {
1991             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1992             0
1993         };
1994
1995         cl_uint nd0 = 0;
1996         int dtype = dtype0 & 15;
1997         cl_int status = clGetDeviceIDs(pl, dtype, 0, NULL, &nd0);
1998         if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
1999         {
2000             CV_OCL_DBG_CHECK_RESULT(status,
2001                 cv::format("clGetDeviceIDs(platform=%p, device_type=%d, num_entries=0, devices=NULL, numDevices=%p)", pl, dtype, &nd0).c_str());
2002         }
2003
2004         if (nd0 == 0)
2005             return;
2006
2007         AutoBuffer<void*> dlistbuf(nd0*2+1);
2008         cl_device_id* dlist = (cl_device_id*)dlistbuf.data();
2009         cl_device_id* dlist_new = dlist + nd0;
2010         CV_OCL_DBG_CHECK(clGetDeviceIDs(pl, dtype, nd0, dlist, &nd0));
2011
2012         cl_uint i, nd = 0;
2013         String name0;
2014         for(i = 0; i < nd0; i++)
2015         {
2016             Device d(dlist[i]);
2017             if( !d.available() || !d.compilerAvailable() )
2018                 continue;
2019             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2020                 continue;
2021             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2022                 continue;
2023             String name = d.name();
2024             if( nd != 0 && name != name0 )
2025                 continue;
2026             name0 = name;
2027             dlist_new[nd++] = dlist[i];
2028         }
2029
2030         if(nd == 0)
2031             return;
2032
2033         // !!! in the current implementation force the number of devices to 1 !!!
2034         nd = 1;
2035
2036         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2037         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateContext");
2038         bool ok = handle != 0 && retval == CL_SUCCESS;
2039         if( ok )
2040         {
2041             devices.resize(nd);
2042             for( i = 0; i < nd; i++ )
2043                 devices[i].set(dlist_new[i]);
2044         }
2045     }
2046
2047     ~Impl()
2048     {
2049         if(handle)
2050         {
2051             CV_OCL_DBG_CHECK(clReleaseContext(handle));
2052             handle = NULL;
2053         }
2054         devices.clear();
2055     }
2056
2057     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2058
2059     void unloadProg(Program& prog)
2060     {
2061         cv::AutoLock lock(program_cache_mutex);
2062         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2063         {
2064               phash_t::iterator it = phash.find(*i);
2065               if (it != phash.end())
2066               {
2067                   if (it->second.ptr() == prog.ptr())
2068                   {
2069                       phash.erase(*i);
2070                       cacheList.erase(i);
2071                       return;
2072                   }
2073               }
2074         }
2075     }
2076
2077     std::string& getPrefixString()
2078     {
2079         if (prefix.empty())
2080         {
2081             cv::AutoLock lock(program_cache_mutex);
2082             if (prefix.empty())
2083             {
2084                 CV_Assert(!devices.empty());
2085                 const Device& d = devices[0];
2086                 int bits = d.addressBits();
2087                 if (bits > 0 && bits != 64)
2088                     prefix = cv::format("%d-bit--", bits);
2089                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2090                 // sanitize chars
2091                 for (size_t i = 0; i < prefix.size(); i++)
2092                 {
2093                     char c = prefix[i];
2094                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2095                     {
2096                         prefix[i] = '_';
2097                     }
2098                 }
2099             }
2100         }
2101         return prefix;
2102     }
2103
2104     std::string& getPrefixBase()
2105     {
2106         if (prefix_base.empty())
2107         {
2108             cv::AutoLock lock(program_cache_mutex);
2109             if (prefix_base.empty())
2110             {
2111                 const Device& d = devices[0];
2112                 int bits = d.addressBits();
2113                 if (bits > 0 && bits != 64)
2114                     prefix_base = cv::format("%d-bit--", bits);
2115                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2116                 // sanitize chars
2117                 for (size_t i = 0; i < prefix_base.size(); i++)
2118                 {
2119                     char c = prefix_base[i];
2120                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2121                     {
2122                         prefix_base[i] = '_';
2123                     }
2124                 }
2125             }
2126         }
2127         return prefix_base;
2128     }
2129
2130     IMPLEMENT_REFCOUNTABLE();
2131
2132     cl_context handle;
2133     std::vector<Device> devices;
2134
2135     std::string prefix;
2136     std::string prefix_base;
2137
2138     cv::Mutex program_cache_mutex;
2139     typedef std::map<std::string, Program> phash_t;
2140     phash_t phash;
2141     typedef std::list<cv::String> CacheList;
2142     CacheList cacheList;
2143
2144 #ifdef HAVE_OPENCL_SVM
2145     bool svmInitialized;
2146     bool svmAvailable;
2147     bool svmEnabled;
2148     svm::SVMCapabilities svmCapabilities;
2149     svm::SVMFunctions svmFunctions;
2150
2151     void svmInit()
2152     {
2153         CV_Assert(handle != NULL);
2154         const Device& device = devices[0];
2155         cl_device_svm_capabilities deviceCaps = 0;
2156         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2157         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2158         if (status != CL_SUCCESS)
2159         {
2160             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2161             goto noSVM;
2162         }
2163         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2164         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2165         svmCapabilities.value_ =
2166                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2167                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2168                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2169                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2170         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2171         if (svmCapabilities.value_ == 0)
2172         {
2173             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2174             goto noSVM;
2175         }
2176         try
2177         {
2178             // Try OpenCL 2.0
2179             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2180             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2181             if (!ptr)
2182             {
2183                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2184                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2185             }
2186             try
2187             {
2188                 bool error = false;
2189                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2190                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2191                 {
2192                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2193                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2194                 }
2195                 clFinish(q);
2196                 try
2197                 {
2198                     ((int*)ptr)[0] = 100;
2199                 }
2200                 catch (...)
2201                 {
2202                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2203                     error = true;
2204                 }
2205                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2206                 {
2207                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2208                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2209                 }
2210                 clFinish(q);
2211                 if (error)
2212                 {
2213                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2214                 }
2215             }
2216             catch (...)
2217             {
2218                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2219                 clSVMFree(handle, ptr);
2220                 throw;
2221             }
2222             clSVMFree(handle, ptr);
2223             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2224             svmFunctions.fn_clSVMFree = clSVMFree;
2225             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2226             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2227             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2228             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2229             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2230             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2231             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2232         }
2233         catch (...)
2234         {
2235             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2236             try
2237             {
2238                 // Try HSA extension
2239                 String extensions = device.extensions();
2240                 if (extensions.find("cl_amd_svm") == String::npos)
2241                 {
2242                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2243                     goto noSVM;
2244                 }
2245                 cl_platform_id p = NULL;
2246                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2247                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2248                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2249                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2250                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2251                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2252                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2253                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2254                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2255                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2256                 CV_Assert(svmFunctions.isValid());
2257             }
2258             catch (...)
2259             {
2260                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2261                 goto noSVM;
2262             }
2263         }
2264
2265         svmAvailable = true;
2266         svmEnabled = !svm::checkDisableSVM();
2267         svmInitialized = true;
2268         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2269         return;
2270     noSVM:
2271         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2272         svmAvailable = false;
2273         svmEnabled = false;
2274         svmCapabilities.value_ = 0;
2275         svmInitialized = true;
2276         svmFunctions.fn_clSVMAlloc = NULL;
2277         return;
2278     }
2279 #endif
2280
2281     friend class Program;
2282 };
2283
2284
2285 Context::Context()
2286 {
2287     p = 0;
2288 }
2289
2290 Context::Context(int dtype)
2291 {
2292     p = 0;
2293     create(dtype);
2294 }
2295
2296 bool Context::create()
2297 {
2298     if( !haveOpenCL() )
2299         return false;
2300     if(p)
2301         p->release();
2302     p = new Impl();
2303     if(!p->handle)
2304     {
2305         delete p;
2306         p = 0;
2307     }
2308     return p != 0;
2309 }
2310
2311 bool Context::create(int dtype0)
2312 {
2313     if( !haveOpenCL() )
2314         return false;
2315     if(p)
2316         p->release();
2317     p = new Impl(dtype0);
2318     if(!p->handle)
2319     {
2320         delete p;
2321         p = 0;
2322     }
2323     return p != 0;
2324 }
2325
2326 Context::~Context()
2327 {
2328     if (p)
2329     {
2330         p->release();
2331         p = NULL;
2332     }
2333 }
2334
2335 Context::Context(const Context& c)
2336 {
2337     p = (Impl*)c.p;
2338     if(p)
2339         p->addref();
2340 }
2341
2342 Context& Context::operator = (const Context& c)
2343 {
2344     Impl* newp = (Impl*)c.p;
2345     if(newp)
2346         newp->addref();
2347     if(p)
2348         p->release();
2349     p = newp;
2350     return *this;
2351 }
2352
2353 void* Context::ptr() const
2354 {
2355     return p == NULL ? NULL : p->handle;
2356 }
2357
2358 size_t Context::ndevices() const
2359 {
2360     return p ? p->devices.size() : 0;
2361 }
2362
2363 const Device& Context::device(size_t idx) const
2364 {
2365     static Device dummy;
2366     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2367 }
2368
2369 Context& Context::getDefault(bool initialize)
2370 {
2371     static Context* ctx = new Context();
2372     if(!ctx->p && haveOpenCL())
2373     {
2374         if (!ctx->p)
2375             ctx->p = new Impl();
2376         if (initialize)
2377         {
2378             // do not create new Context right away.
2379             // First, try to retrieve existing context of the same type.
2380             // In its turn, Platform::getContext() may call Context::create()
2381             // if there is no such context.
2382             if (ctx->p->handle == NULL)
2383                 ctx->p->setDefault();
2384         }
2385     }
2386
2387     return *ctx;
2388 }
2389
2390 Program Context::getProg(const ProgramSource& prog,
2391                          const String& buildopts, String& errmsg)
2392 {
2393     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2394 }
2395
2396 void Context::unloadProg(Program& prog)
2397 {
2398     if (p)
2399         p->unloadProg(prog);
2400 }
2401
2402 #ifdef HAVE_OPENCL_SVM
2403 bool Context::useSVM() const
2404 {
2405     Context::Impl* i = p;
2406     CV_Assert(i);
2407     if (!i->svmInitialized)
2408         i->svmInit();
2409     return i->svmEnabled;
2410 }
2411 void Context::setUseSVM(bool enabled)
2412 {
2413     Context::Impl* i = p;
2414     CV_Assert(i);
2415     if (!i->svmInitialized)
2416         i->svmInit();
2417     if (enabled && !i->svmAvailable)
2418     {
2419         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2420     }
2421     i->svmEnabled = enabled;
2422 }
2423 #else
2424 bool Context::useSVM() const { return false; }
2425 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2426 #endif
2427
2428 #ifdef HAVE_OPENCL_SVM
2429 namespace svm {
2430
2431 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2432 {
2433     Context::Impl* i = context.p;
2434     CV_Assert(i);
2435     if (!i->svmInitialized)
2436         i->svmInit();
2437     return i->svmCapabilities;
2438 }
2439
2440 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2441 {
2442     Context::Impl* i = context.p;
2443     CV_Assert(i);
2444     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2445     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2446     return &i->svmFunctions;
2447 }
2448
2449 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2450 {
2451     if (checkForceSVMUmatUsage())
2452         return true;
2453     if (checkDisableSVMUMatUsage())
2454         return false;
2455     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2456         return true;
2457     return false; // don't use SVM by default
2458 }
2459
2460 } // namespace cv::ocl::svm
2461 #endif // HAVE_OPENCL_SVM
2462
2463
2464 static void get_platform_name(cl_platform_id id, String& name)
2465 {
2466     // get platform name string length
2467     size_t sz = 0;
2468     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
2469
2470     // get platform name string
2471     AutoBuffer<char> buf(sz + 1);
2472     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
2473
2474     // just in case, ensure trailing zero for ASCIIZ string
2475     buf[sz] = 0;
2476
2477     name = buf.data();
2478 }
2479
2480 /*
2481 // Attaches OpenCL context to OpenCV
2482 */
2483 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
2484 {
2485     cl_uint cnt = 0;
2486
2487     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
2488
2489     if (cnt == 0)
2490         CV_Error(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
2491
2492     std::vector<cl_platform_id> platforms(cnt);
2493
2494     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
2495
2496     bool platformAvailable = false;
2497
2498     // check if external platformName contained in list of available platforms in OpenCV
2499     for (unsigned int i = 0; i < cnt; i++)
2500     {
2501         String availablePlatformName;
2502         get_platform_name(platforms[i], availablePlatformName);
2503         // external platform is found in the list of available platforms
2504         if (platformName == availablePlatformName)
2505         {
2506             platformAvailable = true;
2507             break;
2508         }
2509     }
2510
2511     if (!platformAvailable)
2512         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2513
2514     // check if platformID corresponds to platformName
2515     String actualPlatformName;
2516     get_platform_name((cl_platform_id)platformID, actualPlatformName);
2517     if (platformName != actualPlatformName)
2518         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2519
2520     // do not initialize OpenCL context
2521     Context ctx = Context::getDefault(false);
2522
2523     // attach supplied context to OpenCV
2524     initializeContextFromHandle(ctx, platformID, context, deviceID);
2525
2526     CV_OCL_CHECK(clRetainContext((cl_context)context));
2527
2528     // clear command queue, if any
2529     CoreTLSData& data = getCoreTlsData();
2530     data.oclQueue.finish();
2531     Queue q;
2532     data.oclQueue = q;
2533
2534     return;
2535 } // attachContext()
2536
2537
2538 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2539 {
2540     cl_context context = (cl_context)_context;
2541     cl_device_id device = (cl_device_id)_device;
2542
2543     // cleanup old context
2544     Context::Impl * impl = ctx.p;
2545     if (impl->handle)
2546     {
2547         CV_OCL_DBG_CHECK(clReleaseContext(impl->handle));
2548     }
2549     impl->devices.clear();
2550
2551     impl->handle = context;
2552     impl->devices.resize(1);
2553     impl->devices[0].set(device);
2554
2555     Platform& p = Platform::getDefault();
2556     Platform::Impl* pImpl = p.p;
2557     pImpl->handle = (cl_platform_id)platform;
2558 }
2559
2560 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2561
2562 struct Queue::Impl
2563 {
2564     inline void __init()
2565     {
2566         refcount = 1;
2567         handle = 0;
2568         isProfilingQueue_ = false;
2569     }
2570
2571     Impl(cl_command_queue q)
2572     {
2573         __init();
2574         handle = q;
2575
2576         cl_command_queue_properties props = 0;
2577         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
2578         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
2579     }
2580
2581     Impl(cl_command_queue q, bool isProfilingQueue)
2582     {
2583         __init();
2584         handle = q;
2585         isProfilingQueue_ = isProfilingQueue;
2586     }
2587
2588     Impl(const Context& c, const Device& d, bool withProfiling = false)
2589     {
2590         __init();
2591
2592         const Context* pc = &c;
2593         cl_context ch = (cl_context)pc->ptr();
2594         if( !ch )
2595         {
2596             pc = &Context::getDefault();
2597             ch = (cl_context)pc->ptr();
2598         }
2599         cl_device_id dh = (cl_device_id)d.ptr();
2600         if( !dh )
2601             dh = (cl_device_id)pc->device(0).ptr();
2602         cl_int retval = 0;
2603         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
2604         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
2605         isProfilingQueue_ = withProfiling;
2606     }
2607
2608     ~Impl()
2609     {
2610 #ifdef _WIN32
2611         if (!cv::__termination)
2612 #endif
2613         {
2614             if(handle)
2615             {
2616                 CV_OCL_DBG_CHECK(clFinish(handle));
2617                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
2618                 handle = NULL;
2619             }
2620         }
2621     }
2622
2623     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
2624     {
2625         if (isProfilingQueue_)
2626             return self;
2627
2628         if (profiling_queue_.ptr())
2629             return profiling_queue_;
2630
2631         cl_context ctx = 0;
2632         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
2633
2634         cl_device_id device = 0;
2635         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
2636
2637         cl_int result = CL_SUCCESS;
2638         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
2639         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
2640         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
2641
2642         Queue queue;
2643         queue.p = new Impl(q, true);
2644         profiling_queue_ = queue;
2645
2646         return profiling_queue_;
2647     }
2648
2649     IMPLEMENT_REFCOUNTABLE();
2650
2651     cl_command_queue handle;
2652     bool isProfilingQueue_;
2653     cv::ocl::Queue profiling_queue_;
2654 };
2655
2656 Queue::Queue()
2657 {
2658     p = 0;
2659 }
2660
2661 Queue::Queue(const Context& c, const Device& d)
2662 {
2663     p = 0;
2664     create(c, d);
2665 }
2666
2667 Queue::Queue(const Queue& q)
2668 {
2669     p = q.p;
2670     if(p)
2671         p->addref();
2672 }
2673
2674 Queue& Queue::operator = (const Queue& q)
2675 {
2676     Impl* newp = (Impl*)q.p;
2677     if(newp)
2678         newp->addref();
2679     if(p)
2680         p->release();
2681     p = newp;
2682     return *this;
2683 }
2684
2685 Queue::~Queue()
2686 {
2687     if(p)
2688         p->release();
2689 }
2690
2691 bool Queue::create(const Context& c, const Device& d)
2692 {
2693     if(p)
2694         p->release();
2695     p = new Impl(c, d);
2696     return p->handle != 0;
2697 }
2698
2699 void Queue::finish()
2700 {
2701     if(p && p->handle)
2702     {
2703         CV_OCL_DBG_CHECK(clFinish(p->handle));
2704     }
2705 }
2706
2707 const Queue& Queue::getProfilingQueue() const
2708 {
2709     CV_Assert(p);
2710     return p->getProfilingQueue(*this);
2711 }
2712
2713 void* Queue::ptr() const
2714 {
2715     return p ? p->handle : 0;
2716 }
2717
2718 Queue& Queue::getDefault()
2719 {
2720     Queue& q = getCoreTlsData().oclQueue;
2721     if( !q.p && haveOpenCL() )
2722         q.create(Context::getDefault());
2723     return q;
2724 }
2725
2726 static cl_command_queue getQueue(const Queue& q)
2727 {
2728     cl_command_queue qq = (cl_command_queue)q.ptr();
2729     if(!qq)
2730         qq = (cl_command_queue)Queue::getDefault().ptr();
2731     return qq;
2732 }
2733
2734 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2735
2736 KernelArg::KernelArg()
2737     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2738 {
2739 }
2740
2741 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2742     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2743 {
2744     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
2745 }
2746
2747 KernelArg KernelArg::Constant(const Mat& m)
2748 {
2749     CV_Assert(m.isContinuous());
2750     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2751 }
2752
2753 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2754
2755 struct Kernel::Impl
2756 {
2757     Impl(const char* kname, const Program& prog) :
2758         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
2759     {
2760         cl_program ph = (cl_program)prog.ptr();
2761         cl_int retval = 0;
2762         name = kname;
2763         if (ph)
2764         {
2765             handle = clCreateKernel(ph, kname, &retval);
2766             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
2767         }
2768         for( int i = 0; i < MAX_ARRS; i++ )
2769             u[i] = 0;
2770         haveTempDstUMats = false;
2771         haveTempSrcUMats = false;
2772     }
2773
2774     void cleanupUMats()
2775     {
2776         for( int i = 0; i < MAX_ARRS; i++ )
2777             if( u[i] )
2778             {
2779                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2780                 {
2781                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
2782                     u[i]->currAllocator->deallocate(u[i]);
2783                 }
2784                 u[i] = 0;
2785             }
2786         nu = 0;
2787         haveTempDstUMats = false;
2788         haveTempSrcUMats = false;
2789     }
2790
2791     void addUMat(const UMat& m, bool dst)
2792     {
2793         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2794         u[nu] = m.u;
2795         CV_XADD(&m.u->urefcount, 1);
2796         nu++;
2797         if(dst && m.u->tempUMat())
2798             haveTempDstUMats = true;
2799         if(m.u->originalUMatData == NULL && m.u->tempUMat())
2800             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
2801     }
2802
2803     void addImage(const Image2D& image)
2804     {
2805         images.push_back(image);
2806     }
2807
2808     void finit(cl_event e)
2809     {
2810         CV_UNUSED(e);
2811         cleanupUMats();
2812         images.clear();
2813         isInProgress = false;
2814         release();
2815     }
2816
2817     bool run(int dims, size_t _globalsize[], size_t _localsize[],
2818             bool sync, int64* timeNS, const Queue& q);
2819
2820     ~Impl()
2821     {
2822         if(handle)
2823         {
2824             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
2825         }
2826     }
2827
2828     IMPLEMENT_REFCOUNTABLE();
2829
2830     cv::String name;
2831     cl_kernel handle;
2832     enum { MAX_ARRS = 16 };
2833     UMatData* u[MAX_ARRS];
2834     bool isInProgress;
2835     bool isAsyncRun;  // true if kernel was scheduled in async mode
2836     int nu;
2837     std::list<Image2D> images;
2838     bool haveTempDstUMats;
2839     bool haveTempSrcUMats;
2840 };
2841
2842 }} // namespace cv::ocl
2843
2844 extern "C" {
2845
2846 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
2847 {
2848     try
2849     {
2850         ((cv::ocl::Kernel::Impl*)p)->finit(e);
2851     }
2852     catch (const cv::Exception& exc)
2853     {
2854         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
2855     }
2856     catch (const std::exception& exc)
2857     {
2858         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
2859     }
2860     catch (...)
2861     {
2862         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
2863     }
2864 }
2865
2866 }
2867
2868 namespace cv { namespace ocl {
2869
2870 Kernel::Kernel()
2871 {
2872     p = 0;
2873 }
2874
2875 Kernel::Kernel(const char* kname, const Program& prog)
2876 {
2877     p = 0;
2878     create(kname, prog);
2879 }
2880
2881 Kernel::Kernel(const char* kname, const ProgramSource& src,
2882                const String& buildopts, String* errmsg)
2883 {
2884     p = 0;
2885     create(kname, src, buildopts, errmsg);
2886 }
2887
2888 Kernel::Kernel(const Kernel& k)
2889 {
2890     p = k.p;
2891     if(p)
2892         p->addref();
2893 }
2894
2895 Kernel& Kernel::operator = (const Kernel& k)
2896 {
2897     Impl* newp = (Impl*)k.p;
2898     if(newp)
2899         newp->addref();
2900     if(p)
2901         p->release();
2902     p = newp;
2903     return *this;
2904 }
2905
2906 Kernel::~Kernel()
2907 {
2908     if(p)
2909         p->release();
2910 }
2911
2912 bool Kernel::create(const char* kname, const Program& prog)
2913 {
2914     if(p)
2915         p->release();
2916     p = new Impl(kname, prog);
2917     if(p->handle == 0)
2918     {
2919         p->release();
2920         p = 0;
2921     }
2922 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
2923     CV_Assert(p);
2924 #endif
2925     return p != 0;
2926 }
2927
2928 bool Kernel::create(const char* kname, const ProgramSource& src,
2929                     const String& buildopts, String* errmsg)
2930 {
2931     if(p)
2932     {
2933         p->release();
2934         p = 0;
2935     }
2936     String tempmsg;
2937     if( !errmsg ) errmsg = &tempmsg;
2938     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2939     return create(kname, prog);
2940 }
2941
2942 void* Kernel::ptr() const
2943 {
2944     return p ? p->handle : 0;
2945 }
2946
2947 bool Kernel::empty() const
2948 {
2949     return ptr() == 0;
2950 }
2951
2952 int Kernel::set(int i, const void* value, size_t sz)
2953 {
2954     if (!p || !p->handle)
2955         return -1;
2956     if (i < 0)
2957         return i;
2958     if( i == 0 )
2959         p->cleanupUMats();
2960
2961     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2962     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
2963     if (retval != CL_SUCCESS)
2964         return -1;
2965     return i+1;
2966 }
2967
2968 int Kernel::set(int i, const Image2D& image2D)
2969 {
2970     p->addImage(image2D);
2971     cl_mem h = (cl_mem)image2D.ptr();
2972     return set(i, &h, sizeof(h));
2973 }
2974
2975 int Kernel::set(int i, const UMat& m)
2976 {
2977     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
2978 }
2979
2980 int Kernel::set(int i, const KernelArg& arg)
2981 {
2982     if( !p || !p->handle )
2983         return -1;
2984     if (i < 0)
2985     {
2986         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
2987                 p->name.c_str(), (int)i));
2988         return i;
2989     }
2990     if( i == 0 )
2991         p->cleanupUMats();
2992     cl_int status = 0;
2993     if( arg.m )
2994     {
2995         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2996                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2997         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2998         if (ptronly && arg.m->empty())
2999         {
3000             cl_mem h_null = (cl_mem)NULL;
3001             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3002             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3003             return i + 1;
3004         }
3005         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3006
3007         if (!h)
3008         {
3009             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3010                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3011             p->release();
3012             p = 0;
3013             return -1;
3014         }
3015
3016 #ifdef HAVE_OPENCL_SVM
3017         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3018         {
3019             const Context& ctx = Context::getDefault();
3020             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3021             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3022             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3023 #if 1 // TODO
3024             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3025 #else
3026             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3027 #endif
3028             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3029         }
3030         else
3031 #endif
3032         {
3033             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3034             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3035         }
3036
3037         if (ptronly)
3038         {
3039             i++;
3040         }
3041         else if( arg.m->dims <= 2 )
3042         {
3043             UMat2D u2d(*arg.m);
3044             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3045             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3046             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3047             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3048             i += 3;
3049
3050             if( !(arg.flags & KernelArg::NO_SIZE) )
3051             {
3052                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3053                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3054                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3055                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3056                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3057                 i += 2;
3058             }
3059         }
3060         else
3061         {
3062             UMat3D u3d(*arg.m);
3063             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3064             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3065             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3066             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3067             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3068             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3069             i += 4;
3070             if( !(arg.flags & KernelArg::NO_SIZE) )
3071             {
3072                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3073                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3074                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3075                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3076                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3077                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3078                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3079                 i += 3;
3080             }
3081         }
3082         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3083         return i;
3084     }
3085     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3086     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3087     return i+1;
3088 }
3089
3090 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3091                  bool sync, const Queue& q)
3092 {
3093     if (!p)
3094         return false;
3095
3096     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3097     size_t total = 1;
3098     CV_Assert(_globalsize != NULL);
3099     for (int i = 0; i < dims; i++)
3100     {
3101         size_t val = _localsize ? _localsize[i] :
3102             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3103         CV_Assert( val > 0 );
3104         total *= _globalsize[i];
3105         if (_globalsize[i] == 1 && !_localsize)
3106             val = 1;
3107         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3108     }
3109     CV_Assert(total > 0);
3110
3111     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3112 }
3113
3114
3115 static bool isRaiseErrorOnReuseAsyncKernel()
3116 {
3117     static bool initialized = false;
3118     static bool value = false;
3119     if (!initialized)
3120     {
3121         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3122         initialized = true;
3123     }
3124     return value;
3125 }
3126
3127 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3128         bool sync, int64* timeNS, const Queue& q)
3129 {
3130     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3131
3132     if (!handle)
3133     {
3134         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3135         return false;
3136     }
3137
3138     if (isAsyncRun)
3139     {
3140         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3141         if (isRaiseErrorOnReuseAsyncKernel())
3142             CV_Assert(0);
3143         return false;  // OpenCV 5.0: raise error
3144     }
3145     isAsyncRun = !sync;
3146
3147     if (isInProgress)
3148     {
3149         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3150         if (isRaiseErrorOnReuseAsyncKernel())
3151             CV_Assert(0);
3152         return false;  // OpenCV 5.0: raise error
3153     }
3154
3155     cl_command_queue qq = getQueue(q);
3156     if (haveTempDstUMats)
3157         sync = true;
3158     if (haveTempSrcUMats)
3159         sync = true;
3160     if (timeNS)
3161         sync = true;
3162     cl_event asyncEvent = 0;
3163     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3164                                            NULL, globalsize, localsize, 0, 0,
3165                                            (sync && !timeNS) ? 0 : &asyncEvent);
3166 #if !CV_OPENCL_SHOW_RUN_KERNELS
3167     if (retval != CL_SUCCESS)
3168 #endif
3169     {
3170         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%dx%dx%d, localsize=%s) sync=%s", name.c_str(), (int)dims,
3171                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3172                         (localsize ? cv::format("%dx%dx%d", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3173                         sync ? "true" : "false"
3174                         );
3175         if (retval != CL_SUCCESS)
3176         {
3177             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3178         }
3179 #if CV_OPENCL_TRACE_CHECK
3180         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3181 #else
3182         printf("%s\n", msg.c_str());
3183         fflush(stdout);
3184 #endif
3185     }
3186     if (sync || retval != CL_SUCCESS)
3187     {
3188         CV_OCL_DBG_CHECK(clFinish(qq));
3189         if (timeNS)
3190         {
3191             if (retval == CL_SUCCESS)
3192             {
3193                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3194                 cl_ulong startTime, stopTime;
3195                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3196                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3197                 *timeNS = (int64)(stopTime - startTime);
3198             }
3199             else
3200             {
3201                 *timeNS = -1;
3202             }
3203         }
3204         cleanupUMats();
3205     }
3206     else
3207     {
3208         addref();
3209         isInProgress = true;
3210         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3211     }
3212     if (asyncEvent)
3213         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3214     return retval == CL_SUCCESS;
3215 }
3216
3217 bool Kernel::runTask(bool sync, const Queue& q)
3218 {
3219     if(!p || !p->handle || p->isInProgress)
3220         return false;
3221
3222     cl_command_queue qq = getQueue(q);
3223     cl_event asyncEvent = 0;
3224     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3225     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3226     if (sync || retval != CL_SUCCESS)
3227     {
3228         CV_OCL_DBG_CHECK(clFinish(qq));
3229         p->cleanupUMats();
3230     }
3231     else
3232     {
3233         p->addref();
3234         p->isInProgress = true;
3235         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3236     }
3237     if (asyncEvent)
3238         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3239     return retval == CL_SUCCESS;
3240 }
3241
3242 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3243 {
3244     CV_Assert(p && p->handle && !p->isInProgress);
3245     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3246     CV_Assert(q.ptr());
3247     q.finish(); // call clFinish() on base queue
3248     Queue profilingQueue = q.getProfilingQueue();
3249     int64 timeNs = -1;
3250     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3251     return res ? timeNs : -1;
3252 }
3253
3254 size_t Kernel::workGroupSize() const
3255 {
3256     if(!p || !p->handle)
3257         return 0;
3258     size_t val = 0, retsz = 0;
3259     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3260     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3261     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3262     return status == CL_SUCCESS ? val : 0;
3263 }
3264
3265 size_t Kernel::preferedWorkGroupSizeMultiple() const
3266 {
3267     if(!p || !p->handle)
3268         return 0;
3269     size_t val = 0, retsz = 0;
3270     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3271     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3272     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3273     return status == CL_SUCCESS ? val : 0;
3274 }
3275
3276 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3277 {
3278     if(!p || !p->handle || !wsz)
3279         return 0;
3280     size_t retsz = 0;
3281     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3282     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3283     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3284     return status == CL_SUCCESS;
3285 }
3286
3287 size_t Kernel::localMemSize() const
3288 {
3289     if(!p || !p->handle)
3290         return 0;
3291     size_t retsz = 0;
3292     cl_ulong val = 0;
3293     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3294     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3295     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3296     return status == CL_SUCCESS ? (size_t)val : 0;
3297 }
3298
3299
3300
3301 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3302
3303 struct ProgramSource::Impl
3304 {
3305     IMPLEMENT_REFCOUNTABLE();
3306
3307     enum KIND {
3308         PROGRAM_SOURCE_CODE = 0,
3309         PROGRAM_BINARIES,
3310         PROGRAM_SPIR,
3311         PROGRAM_SPIRV
3312     } kind_;
3313
3314     Impl(const String& src)
3315     {
3316         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3317         initFromSource(src, cv::String());
3318     }
3319     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3320     {
3321         init(PROGRAM_SOURCE_CODE, module, name);
3322         initFromSource(codeStr, codeHash);
3323     }
3324
3325     /// reset fields
3326     void init(enum KIND kind, const String& module, const String& name)
3327     {
3328         refcount = 1;
3329         kind_ = kind;
3330         module_ = module;
3331         name_ = name;
3332
3333         sourceAddr_ = NULL;
3334         sourceSize_ = 0;
3335         isHashUpdated = false;
3336     }
3337
3338     void initFromSource(const String& codeStr, const String& codeHash)
3339     {
3340         codeStr_ = codeStr;
3341         sourceHash_ = codeHash;
3342         if (sourceHash_.empty())
3343         {
3344             updateHash();
3345         }
3346         else
3347         {
3348             isHashUpdated = true;
3349         }
3350     }
3351
3352     void updateHash(const char* hashStr = NULL)
3353     {
3354         if (hashStr)
3355         {
3356             sourceHash_ = cv::String(hashStr);
3357             isHashUpdated = true;
3358             return;
3359         }
3360         uint64 hash = 0;
3361         switch (kind_)
3362         {
3363         case PROGRAM_SOURCE_CODE:
3364             if (sourceAddr_)
3365             {
3366                 CV_Assert(codeStr_.empty());
3367                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3368             }
3369             else
3370             {
3371                 CV_Assert(!codeStr_.empty());
3372                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3373             }
3374             break;
3375         case PROGRAM_BINARIES:
3376         case PROGRAM_SPIR:
3377         case PROGRAM_SPIRV:
3378             hash = crc64(sourceAddr_, sourceSize_);
3379             break;
3380         default:
3381             CV_Error(Error::StsInternal, "Internal error");
3382         }
3383         sourceHash_ = cv::format("%08llx", hash);
3384         isHashUpdated = true;
3385     }
3386
3387     Impl(enum KIND kind,
3388             const String& module, const String& name,
3389             const unsigned char* binary, const size_t size,
3390             const cv::String& buildOptions = cv::String())
3391     {
3392         init(kind, module, name);
3393
3394         sourceAddr_ = binary;
3395         sourceSize_ = size;
3396
3397         buildOptions_ = buildOptions;
3398     }
3399
3400     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3401             const char* sourceCodeStaticStr, const char* hashStaticStr,
3402             const cv::String& buildOptions)
3403     {
3404         ProgramSource result;
3405         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3406                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3407         result.p->updateHash(hashStaticStr);
3408         return result;
3409     }
3410
3411     static ProgramSource fromBinary(const String& module, const String& name,
3412             const unsigned char* binary, const size_t size,
3413             const cv::String& buildOptions)
3414     {
3415         ProgramSource result;
3416         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3417         return result;
3418     }
3419
3420     static ProgramSource fromSPIR(const String& module, const String& name,
3421             const unsigned char* binary, const size_t size,
3422             const cv::String& buildOptions)
3423     {
3424         ProgramSource result;
3425         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3426         return result;
3427     }
3428
3429     String module_;
3430     String name_;
3431
3432     // TODO std::vector<ProgramSource> includes_;
3433     String codeStr_; // PROGRAM_SOURCE_CODE only
3434
3435     const unsigned char* sourceAddr_;
3436     size_t sourceSize_;
3437
3438     cv::String buildOptions_;
3439
3440     String sourceHash_;
3441     bool isHashUpdated;
3442
3443     friend struct Program::Impl;
3444     friend struct internal::ProgramEntry;
3445     friend struct Context::Impl;
3446 };
3447
3448
3449 ProgramSource::ProgramSource()
3450 {
3451     p = 0;
3452 }
3453
3454 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3455 {
3456     p = new Impl(module, name, codeStr, codeHash);
3457 }
3458
3459 ProgramSource::ProgramSource(const char* prog)
3460 {
3461     p = new Impl(prog);
3462 }
3463
3464 ProgramSource::ProgramSource(const String& prog)
3465 {
3466     p = new Impl(prog);
3467 }
3468
3469 ProgramSource::~ProgramSource()
3470 {
3471     if(p)
3472         p->release();
3473 }
3474
3475 ProgramSource::ProgramSource(const ProgramSource& prog)
3476 {
3477     p = prog.p;
3478     if(p)
3479         p->addref();
3480 }
3481
3482 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3483 {
3484     Impl* newp = (Impl*)prog.p;
3485     if(newp)
3486         newp->addref();
3487     if(p)
3488         p->release();
3489     p = newp;
3490     return *this;
3491 }
3492
3493 const String& ProgramSource::source() const
3494 {
3495     CV_Assert(p);
3496     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
3497     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
3498     return p->codeStr_;
3499 }
3500
3501 ProgramSource::hash_t ProgramSource::hash() const
3502 {
3503     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
3504 }
3505
3506 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
3507         const unsigned char* binary, const size_t size,
3508         const cv::String& buildOptions)
3509 {
3510     CV_Assert(binary);
3511     CV_Assert(size > 0);
3512     return Impl::fromBinary(module, name, binary, size, buildOptions);
3513 }
3514
3515 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
3516         const unsigned char* binary, const size_t size,
3517         const cv::String& buildOptions)
3518 {
3519     CV_Assert(binary);
3520     CV_Assert(size > 0);
3521     return Impl::fromBinary(module, name, binary, size, buildOptions);
3522 }
3523
3524
3525 internal::ProgramEntry::operator ProgramSource&() const
3526 {
3527     if (this->pProgramSource == NULL)
3528     {
3529         cv::AutoLock lock(cv::getInitializationMutex());
3530         if (this->pProgramSource == NULL)
3531         {
3532             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
3533             ProgramSource* ptr = new ProgramSource(ps);
3534             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
3535         }
3536     }
3537     return *this->pProgramSource;
3538 }
3539
3540
3541
3542 /////////////////////////////////////////// Program /////////////////////////////////////////////
3543
3544 static
3545 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
3546 {
3547     if (b.empty())
3548         return a;
3549     if (a.empty())
3550         return b;
3551     if (b[0] == ' ')
3552         return a + b;
3553     return a + (cv::String(" ") + b);
3554 }
3555
3556 struct Program::Impl
3557 {
3558     IMPLEMENT_REFCOUNTABLE();
3559
3560     Impl(const ProgramSource& src,
3561          const String& _buildflags, String& errmsg) :
3562          refcount(1),
3563          handle(NULL),
3564          buildflags(_buildflags)
3565     {
3566         const ProgramSource::Impl* src_ = src.getImpl();
3567         CV_Assert(src_);
3568         sourceModule_ = src_->module_;
3569         sourceName_ = src_->name_;
3570         const Context ctx = Context::getDefault();
3571         Device device = ctx.device(0);
3572         if (ctx.ptr() == NULL || device.ptr() == NULL)
3573             return;
3574         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
3575         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3576         {
3577             if (device.isAMD())
3578                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
3579             else if (device.isIntel())
3580                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
3581             const String param_buildExtraOptions = getBuildExtraOptions();
3582             if (!param_buildExtraOptions.empty())
3583                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
3584         }
3585         compile(ctx, src_, errmsg);
3586     }
3587
3588     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3589     {
3590         CV_Assert(ctx.getImpl());
3591         CV_Assert(src_);
3592
3593         // We don't cache OpenCL binaries
3594         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
3595         {
3596             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3597             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3598             return isLoaded;
3599         }
3600         return compileWithCache(ctx, src_, errmsg);
3601     }
3602
3603     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3604     {
3605         CV_Assert(ctx.getImpl());
3606         CV_Assert(src_);
3607         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
3608
3609 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3610         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
3611         const std::string base_dir = config.prepareCacheDirectoryForContext(
3612                 ctx.getImpl()->getPrefixString(),
3613                 ctx.getImpl()->getPrefixBase()
3614         );
3615         const String& hash_str = src_->sourceHash_;
3616         cv::String fname;
3617         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
3618         {
3619             CV_Assert(!hash_str.empty());
3620             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
3621             fname = utils::fs::join(base_dir, fname);
3622         }
3623         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
3624         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
3625         {
3626             try
3627             {
3628                 std::vector<char> binaryBuf;
3629                 bool res = false;
3630                 {
3631                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3632                     BinaryProgramFile file(fname, hash_str.c_str());
3633                     res = file.read(buildflags, binaryBuf);
3634                 }
3635                 if (res)
3636                 {
3637                     CV_Assert(!binaryBuf.empty());
3638                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
3639                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
3640                     if (isLoaded)
3641                         return true;
3642                 }
3643             }
3644             catch (const cv::Exception& e)
3645             {
3646                 CV_UNUSED(e);
3647                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
3648             }
3649             catch (...)
3650             {
3651                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
3652             }
3653         }
3654 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3655         CV_Assert(handle == NULL);
3656         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3657         {
3658             if (!buildFromSources(ctx, src_, errmsg))
3659             {
3660                 return false;
3661             }
3662         }
3663         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
3664         {
3665             buildflags = joinBuildOptions(buildflags, " -x spir");
3666             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
3667             {
3668                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
3669             }
3670             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3671             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3672             if (!isLoaded)
3673                 return false;
3674         }
3675         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
3676         {
3677             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
3678         }
3679         else
3680         {
3681             CV_Error(Error::StsInternal, "Internal error");
3682         }
3683         CV_Assert(handle != NULL);
3684 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3685         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
3686         {
3687             try
3688             {
3689                 std::vector<char> binaryBuf;
3690                 getProgramBinary(binaryBuf);
3691                 {
3692                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3693                     BinaryProgramFile file(fname, hash_str.c_str());
3694                     file.write(buildflags, binaryBuf);
3695                 }
3696             }
3697             catch (const cv::Exception& e)
3698             {
3699                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
3700             }
3701             catch (...)
3702             {
3703                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
3704             }
3705         }
3706 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3707 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3708         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3709         {
3710             std::vector<char> binaryBuf;
3711             getProgramBinary(binaryBuf);
3712             if (!binaryBuf.empty())
3713             {
3714                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3715                 handle = NULL;
3716                 createFromBinary(ctx, binaryBuf, errmsg);
3717             }
3718         }
3719 #endif
3720         return handle != NULL;
3721     }
3722
3723     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
3724     {
3725         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
3726
3727         size_t retsz = 0;
3728         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3729                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3730         if (log_retval == CL_SUCCESS && retsz > 1)
3731         {
3732             buffer.resize(retsz + 16);
3733             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3734                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
3735             if (log_retval == CL_SUCCESS)
3736             {
3737                 if (retsz < buffer.size())
3738                     buffer[retsz] = 0;
3739                 else
3740                     buffer[buffer.size() - 1] = 0;
3741             }
3742             else
3743             {
3744                 buffer[0] = 0;
3745             }
3746         }
3747
3748         errmsg = String(buffer.data());
3749         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
3750                 sourceModule_.c_str(), sourceName_.c_str(),
3751                 result, getOpenCLErrorString(result),
3752                 buildflags.c_str(), errmsg.c_str());
3753         fflush(stdout);
3754     }
3755
3756     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3757     {
3758         CV_Assert(src_);
3759         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
3760         CV_Assert(handle == NULL);
3761         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
3762                 sourceModule_.c_str(), sourceName_.c_str(),
3763                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
3764
3765         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
3766
3767         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
3768         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
3769         CV_Assert(srcptr != NULL);
3770         CV_Assert(srclen > 0);
3771
3772         cl_int retval = 0;
3773
3774         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3775         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
3776         CV_Assert(handle || retval != CL_SUCCESS);
3777         if (handle && retval == CL_SUCCESS)
3778         {
3779             size_t n = ctx.ndevices();
3780             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
3781             cl_device_id* deviceList = deviceListBuf.data();
3782             for (size_t i = 0; i < n; i++)
3783             {
3784                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
3785             }
3786
3787             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
3788             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
3789 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3790             if (retval != CL_SUCCESS)
3791 #endif
3792             {
3793                 dumpBuildLog_(retval, deviceList, errmsg);
3794
3795                 // don't remove "retval != CL_SUCCESS" condition here:
3796                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
3797                 if (retval != CL_SUCCESS && handle)
3798                 {
3799                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3800                     handle = NULL;
3801                 }
3802             }
3803 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3804             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3805             {
3806                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
3807                 size_t retsz = 0;
3808                 char kernels_buffer[4096] = {0};
3809                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3810                 if (retsz < sizeof(kernels_buffer))
3811                     kernels_buffer[retsz] = 0;
3812                 else
3813                     kernels_buffer[0] = 0;
3814                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3815             }
3816 #endif
3817
3818         }
3819         return handle != NULL;
3820     }
3821
3822     void getProgramBinary(std::vector<char>& buf)
3823     {
3824         CV_Assert(handle);
3825         size_t sz = 0;
3826         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
3827         buf.resize(sz);
3828         uchar* ptr = (uchar*)&buf[0];
3829         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
3830     }
3831
3832     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
3833     {
3834         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
3835     }
3836
3837     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
3838     {
3839         CV_Assert(handle == NULL);
3840         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
3841         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
3842
3843         CV_Assert(binarySize > 0);
3844
3845         size_t ndevices = (int)ctx.ndevices();
3846         AutoBuffer<cl_device_id> devices_(ndevices);
3847         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
3848         AutoBuffer<size_t> binarySizes_(ndevices);
3849
3850         cl_device_id* devices = devices_.data();
3851         const uchar** binaryPtrs = binaryPtrs_.data();
3852         size_t* binarySizes = binarySizes_.data();
3853         for (size_t i = 0; i < ndevices; i++)
3854         {
3855             devices[i] = (cl_device_id)ctx.device(i).ptr();
3856             binaryPtrs[i] = binaryAddr;
3857             binarySizes[i] = binarySize;
3858         }
3859
3860         cl_int result = 0;
3861         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
3862                                            binarySizes, binaryPtrs, NULL, &result);
3863         if (result != CL_SUCCESS)
3864         {
3865             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
3866             if (handle)
3867             {
3868                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3869                 handle = NULL;
3870             }
3871         }
3872         if (!handle)
3873         {
3874             return false;
3875         }
3876         // call clBuildProgram()
3877         {
3878             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
3879             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
3880             if (result != CL_SUCCESS)
3881             {
3882                 dumpBuildLog_(result, devices, errmsg);
3883                 if (handle)
3884                 {
3885                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3886                     handle = NULL;
3887                 }
3888                 return false;
3889             }
3890         }
3891         // check build status
3892         {
3893             cl_build_status build_status = CL_BUILD_NONE;
3894             size_t retsz = 0;
3895             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
3896                     sizeof(build_status), &build_status, &retsz));
3897             if (result == CL_SUCCESS)
3898             {
3899                 if (build_status == CL_BUILD_SUCCESS)
3900                 {
3901                     return true;
3902                 }
3903                 else
3904                 {
3905                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
3906                     return false;
3907                 }
3908             }
3909             else
3910             {
3911                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
3912                 if (handle)
3913                 {
3914                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3915                     handle = NULL;
3916                 }
3917             }
3918         }
3919 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3920         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3921         {
3922             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
3923             size_t retsz = 0;
3924             char kernels_buffer[4096] = {0};
3925             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3926             if (retsz < sizeof(kernels_buffer))
3927                 kernels_buffer[retsz] = 0;
3928             else
3929                 kernels_buffer[0] = 0;
3930             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3931         }
3932 #endif
3933         return handle != NULL;
3934     }
3935
3936     ~Impl()
3937     {
3938         if( handle )
3939         {
3940 #ifdef _WIN32
3941             if (!cv::__termination)
3942 #endif
3943             {
3944                 clReleaseProgram(handle);
3945             }
3946             handle = NULL;
3947         }
3948     }
3949
3950     cl_program handle;
3951
3952     String buildflags;
3953     String sourceModule_;
3954     String sourceName_;
3955 };
3956
3957
3958 Program::Program() { p = 0; }
3959
3960 Program::Program(const ProgramSource& src,
3961         const String& buildflags, String& errmsg)
3962 {
3963     p = 0;
3964     create(src, buildflags, errmsg);
3965 }
3966
3967 Program::Program(const Program& prog)
3968 {
3969     p = prog.p;
3970     if(p)
3971         p->addref();
3972 }
3973
3974 Program& Program::operator = (const Program& prog)
3975 {
3976     Impl* newp = (Impl*)prog.p;
3977     if(newp)
3978         newp->addref();
3979     if(p)
3980         p->release();
3981     p = newp;
3982     return *this;
3983 }
3984
3985 Program::~Program()
3986 {
3987     if(p)
3988         p->release();
3989 }
3990
3991 bool Program::create(const ProgramSource& src,
3992             const String& buildflags, String& errmsg)
3993 {
3994     if(p)
3995     {
3996         p->release();
3997         p = NULL;
3998     }
3999     p = new Impl(src, buildflags, errmsg);
4000     if(!p->handle)
4001     {
4002         p->release();
4003         p = 0;
4004     }
4005     return p != 0;
4006 }
4007
4008 void* Program::ptr() const
4009 {
4010     return p ? p->handle : 0;
4011 }
4012
4013 #ifndef OPENCV_REMOVE_DEPRECATED_API
4014 const ProgramSource& Program::source() const
4015 {
4016     CV_Error(Error::StsNotImplemented, "Removed API");
4017 }
4018
4019 bool Program::read(const String& bin, const String& buildflags)
4020 {
4021     CV_UNUSED(bin); CV_UNUSED(buildflags);
4022     CV_Error(Error::StsNotImplemented, "Removed API");
4023 }
4024
4025 bool Program::write(String& bin) const
4026 {
4027     CV_UNUSED(bin);
4028     CV_Error(Error::StsNotImplemented, "Removed API");
4029 }
4030
4031 String Program::getPrefix() const
4032 {
4033     if(!p)
4034         return String();
4035     Context::Impl* ctx_ = Context::getDefault().getImpl();
4036     CV_Assert(ctx_);
4037     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4038 }
4039
4040 String Program::getPrefix(const String& buildflags)
4041 {
4042         Context::Impl* ctx_ = Context::getDefault().getImpl();
4043         CV_Assert(ctx_);
4044         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4045 }
4046 #endif // OPENCV_REMOVE_DEPRECATED_API
4047
4048 void Program::getBinary(std::vector<char>& binary) const
4049 {
4050     CV_Assert(p && "Empty program");
4051     p->getProgramBinary(binary);
4052 }
4053
4054 Program Context::Impl::getProg(const ProgramSource& src,
4055                                const String& buildflags, String& errmsg)
4056 {
4057     size_t limit = getProgramCountLimit();
4058     const ProgramSource::Impl* src_ = src.getImpl();
4059     CV_Assert(src_);
4060     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4061             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4062             getPrefixString().c_str(),
4063             buildflags.c_str());
4064     {
4065         cv::AutoLock lock(program_cache_mutex);
4066         phash_t::iterator it = phash.find(key);
4067         if (it != phash.end())
4068         {
4069             // TODO LRU cache
4070             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4071             if (i != cacheList.end() && i != cacheList.begin())
4072             {
4073                 cacheList.erase(i);
4074                 cacheList.push_front(key);
4075             }
4076             return it->second;
4077         }
4078         { // cleanup program cache
4079             size_t sz = phash.size();
4080             if (limit > 0 && sz >= limit)
4081             {
4082                 static bool warningFlag = false;
4083                 if (!warningFlag)
4084                 {
4085                     printf("\nWARNING: OpenCV-OpenCL:\n"
4086                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4087                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4088                     warningFlag = true;
4089                 }
4090                 while (!cacheList.empty())
4091                 {
4092                     size_t c = phash.erase(cacheList.back());
4093                     cacheList.pop_back();
4094                     if (c != 0)
4095                         break;
4096                 }
4097             }
4098         }
4099     }
4100     Program prog(src, buildflags, errmsg);
4101     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4102     {
4103         cv::AutoLock lock(program_cache_mutex);
4104         phash.insert(std::pair<std::string, Program>(key, prog));
4105         cacheList.push_front(key);
4106     }
4107     return prog;
4108 }
4109
4110
4111 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4112
4113 template<typename T>
4114 class OpenCLBufferPool
4115 {
4116 protected:
4117     ~OpenCLBufferPool() { }
4118 public:
4119     virtual T allocate(size_t size) = 0;
4120     virtual void release(T buffer) = 0;
4121 };
4122
4123 template <typename Derived, typename BufferEntry, typename T>
4124 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4125 {
4126 private:
4127     inline Derived& derived() { return *static_cast<Derived*>(this); }
4128 protected:
4129     Mutex mutex_;
4130
4131     size_t currentReservedSize;
4132     size_t maxReservedSize;
4133
4134     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4135     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4136
4137     // synchronized
4138     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4139     {
4140         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4141         for (; i != allocatedEntries_.end(); ++i)
4142         {
4143             BufferEntry& e = *i;
4144             if (e.clBuffer_ == buffer)
4145             {
4146                 entry = e;
4147                 allocatedEntries_.erase(i);
4148                 return true;
4149             }
4150         }
4151         return false;
4152     }
4153
4154     // synchronized
4155     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4156     {
4157         if (reservedEntries_.empty())
4158             return false;
4159         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4160         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4161         BufferEntry result;
4162         size_t minDiff = (size_t)(-1);
4163         for (; i != reservedEntries_.end(); ++i)
4164         {
4165             BufferEntry& e = *i;
4166             if (e.capacity_ >= size)
4167             {
4168                 size_t diff = e.capacity_ - size;
4169                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4170                 {
4171                     minDiff = diff;
4172                     result_pos = i;
4173                     result = e;
4174                     if (diff == 0)
4175                         break;
4176                 }
4177             }
4178         }
4179         if (result_pos != reservedEntries_.end())
4180         {
4181             //CV_DbgAssert(result == *result_pos);
4182             reservedEntries_.erase(result_pos);
4183             entry = result;
4184             currentReservedSize -= entry.capacity_;
4185             allocatedEntries_.push_back(entry);
4186             return true;
4187         }
4188         return false;
4189     }
4190
4191     // synchronized
4192     void _checkSizeOfReservedEntries()
4193     {
4194         while (currentReservedSize > maxReservedSize)
4195         {
4196             CV_DbgAssert(!reservedEntries_.empty());
4197             const BufferEntry& entry = reservedEntries_.back();
4198             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4199             currentReservedSize -= entry.capacity_;
4200             derived()._releaseBufferEntry(entry);
4201             reservedEntries_.pop_back();
4202         }
4203     }
4204
4205     inline size_t _allocationGranularity(size_t size)
4206     {
4207         // heuristic values
4208         if (size < 1024*1024)
4209             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4210         else if (size < 16*1024*1024)
4211             return 64*1024;
4212         else
4213             return 1024*1024;
4214     }
4215
4216 public:
4217     OpenCLBufferPoolBaseImpl()
4218         : currentReservedSize(0),
4219           maxReservedSize(0)
4220     {
4221         // nothing
4222     }
4223     virtual ~OpenCLBufferPoolBaseImpl()
4224     {
4225         freeAllReservedBuffers();
4226         CV_Assert(reservedEntries_.empty());
4227     }
4228 public:
4229     virtual T allocate(size_t size) CV_OVERRIDE
4230     {
4231         AutoLock locker(mutex_);
4232         BufferEntry entry;
4233         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4234         {
4235             CV_DbgAssert(size <= entry.capacity_);
4236             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4237         }
4238         else
4239         {
4240             derived()._allocateBufferEntry(entry, size);
4241         }
4242         return entry.clBuffer_;
4243     }
4244     virtual void release(T buffer) CV_OVERRIDE
4245     {
4246         AutoLock locker(mutex_);
4247         BufferEntry entry;
4248         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4249         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4250         {
4251             derived()._releaseBufferEntry(entry);
4252         }
4253         else
4254         {
4255             reservedEntries_.push_front(entry);
4256             currentReservedSize += entry.capacity_;
4257             _checkSizeOfReservedEntries();
4258         }
4259     }
4260
4261     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4262     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4263     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4264     {
4265         AutoLock locker(mutex_);
4266         size_t oldMaxReservedSize = maxReservedSize;
4267         maxReservedSize = size;
4268         if (maxReservedSize < oldMaxReservedSize)
4269         {
4270             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4271             for (; i != reservedEntries_.end();)
4272             {
4273                 const BufferEntry& entry = *i;
4274                 if (entry.capacity_ > maxReservedSize / 8)
4275                 {
4276                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4277                     currentReservedSize -= entry.capacity_;
4278                     derived()._releaseBufferEntry(entry);
4279                     i = reservedEntries_.erase(i);
4280                     continue;
4281                 }
4282                 ++i;
4283             }
4284             _checkSizeOfReservedEntries();
4285         }
4286     }
4287     virtual void freeAllReservedBuffers() CV_OVERRIDE
4288     {
4289         AutoLock locker(mutex_);
4290         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4291         for (; i != reservedEntries_.end(); ++i)
4292         {
4293             const BufferEntry& entry = *i;
4294             derived()._releaseBufferEntry(entry);
4295         }
4296         reservedEntries_.clear();
4297         currentReservedSize = 0;
4298     }
4299 };
4300
4301 struct CLBufferEntry
4302 {
4303     cl_mem clBuffer_;
4304     size_t capacity_;
4305     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4306 };
4307
4308 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4309 {
4310 public:
4311     typedef struct CLBufferEntry BufferEntry;
4312 protected:
4313     int createFlags_;
4314 public:
4315     OpenCLBufferPoolImpl(int createFlags = 0)
4316         : createFlags_(createFlags)
4317     {
4318     }
4319
4320     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4321     {
4322         CV_DbgAssert(entry.clBuffer_ == NULL);
4323         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4324         Context& ctx = Context::getDefault();
4325         cl_int retval = CL_SUCCESS;
4326         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4327         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4328         CV_Assert(entry.clBuffer_ != NULL);
4329         if(retval == CL_SUCCESS)
4330         {
4331             CV_IMPL_ADD(CV_IMPL_OCL);
4332         }
4333         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4334                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4335         allocatedEntries_.push_back(entry);
4336     }
4337
4338     void _releaseBufferEntry(const BufferEntry& entry)
4339     {
4340         CV_Assert(entry.capacity_ != 0);
4341         CV_Assert(entry.clBuffer_ != NULL);
4342         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4343                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4344         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4345     }
4346 };
4347
4348 #ifdef HAVE_OPENCL_SVM
4349 struct CLSVMBufferEntry
4350 {
4351     void* clBuffer_;
4352     size_t capacity_;
4353     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4354 };
4355 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4356 {
4357 public:
4358     typedef struct CLSVMBufferEntry BufferEntry;
4359 public:
4360     OpenCLSVMBufferPoolImpl()
4361     {
4362     }
4363
4364     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4365     {
4366         CV_DbgAssert(entry.clBuffer_ == NULL);
4367         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4368
4369         Context& ctx = Context::getDefault();
4370         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4371         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4372         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4373                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4374
4375         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4376         CV_DbgAssert(svmFns->isValid());
4377
4378         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4379         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4380         CV_Assert(buf);
4381
4382         entry.clBuffer_ = buf;
4383         {
4384             CV_IMPL_ADD(CV_IMPL_OCL);
4385         }
4386         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4387                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4388         allocatedEntries_.push_back(entry);
4389     }
4390
4391     void _releaseBufferEntry(const BufferEntry& entry)
4392     {
4393         CV_Assert(entry.capacity_ != 0);
4394         CV_Assert(entry.clBuffer_ != NULL);
4395         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4396                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4397         Context& ctx = Context::getDefault();
4398         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4399         CV_DbgAssert(svmFns->isValid());
4400         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4401         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4402     }
4403 };
4404 #endif
4405
4406
4407
4408 template <bool readAccess, bool writeAccess>
4409 class AlignedDataPtr
4410 {
4411 protected:
4412     const size_t size_;
4413     uchar* const originPtr_;
4414     const size_t alignment_;
4415     uchar* ptr_;
4416     uchar* allocatedPtr_;
4417
4418 public:
4419     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4420         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4421     {
4422         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4423         CV_DbgAssert(!readAccess || ptr);
4424         if (((size_t)ptr_ & (alignment - 1)) != 0)
4425         {
4426             allocatedPtr_ = new uchar[size_ + alignment - 1];
4427             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4428             if (readAccess)
4429             {
4430                 memcpy(ptr_, originPtr_, size_);
4431             }
4432         }
4433     }
4434
4435     uchar* getAlignedPtr() const
4436     {
4437         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4438         return ptr_;
4439     }
4440
4441     ~AlignedDataPtr()
4442     {
4443         if (allocatedPtr_)
4444         {
4445             if (writeAccess)
4446             {
4447                 memcpy(originPtr_, ptr_, size_);
4448             }
4449             delete[] allocatedPtr_;
4450             allocatedPtr_ = NULL;
4451         }
4452         ptr_ = NULL;
4453     }
4454 private:
4455     AlignedDataPtr(const AlignedDataPtr&); // disabled
4456     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4457 };
4458
4459 template <bool readAccess, bool writeAccess>
4460 class AlignedDataPtr2D
4461 {
4462 protected:
4463     const size_t size_;
4464     uchar* const originPtr_;
4465     const size_t alignment_;
4466     uchar* ptr_;
4467     uchar* allocatedPtr_;
4468     size_t rows_;
4469     size_t cols_;
4470     size_t step_;
4471
4472 public:
4473     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4474         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4475     {
4476         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4477         CV_DbgAssert(!readAccess || ptr != NULL);
4478         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
4479         {
4480             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
4481             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4482             if (readAccess)
4483             {
4484                 for (size_t i = 0; i < rows_; i++)
4485                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
4486             }
4487         }
4488     }
4489
4490     uchar* getAlignedPtr() const
4491     {
4492         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4493         return ptr_;
4494     }
4495
4496     ~AlignedDataPtr2D()
4497     {
4498         if (allocatedPtr_)
4499         {
4500             if (writeAccess)
4501             {
4502                 for (size_t i = 0; i < rows_; i++)
4503                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
4504             }
4505             delete[] allocatedPtr_;
4506             allocatedPtr_ = NULL;
4507         }
4508         ptr_ = NULL;
4509     }
4510 private:
4511     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
4512     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
4513 };
4514
4515 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4516 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4517 #endif
4518
4519 class OpenCLAllocator CV_FINAL : public MatAllocator
4520 {
4521     mutable OpenCLBufferPoolImpl bufferPool;
4522     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4523 #ifdef  HAVE_OPENCL_SVM
4524     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4525 #endif
4526
4527 public:
4528     enum AllocatorFlags
4529     {
4530         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4531         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
4532 #ifdef HAVE_OPENCL_SVM
4533         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
4534 #endif
4535         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
4536     };
4537
4538     OpenCLAllocator()
4539         : bufferPool(0),
4540           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4541     {
4542         size_t defaultPoolSize, poolSize;
4543         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4544         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4545         bufferPool.setMaxReservedSize(poolSize);
4546         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4547         bufferPoolHostPtr.setMaxReservedSize(poolSize);
4548 #ifdef HAVE_OPENCL_SVM
4549         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4550         bufferPoolSVM.setMaxReservedSize(poolSize);
4551 #endif
4552
4553         matStdAllocator = Mat::getDefaultAllocator();
4554     }
4555     ~OpenCLAllocator()
4556     {
4557         flushCleanupQueue();
4558     }
4559
4560     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4561             int flags, UMatUsageFlags usageFlags) const
4562     {
4563         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4564         return u;
4565     }
4566
4567     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
4568     {
4569         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
4570         return value;
4571     }
4572     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
4573     {
4574         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
4575         return value;
4576     }
4577
4578     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4579     {
4580         const Device& dev = ctx.device(0);
4581         createFlags = 0;
4582         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4583             createFlags |= CL_MEM_ALLOC_HOST_PTR;
4584
4585         if (!isOpenCLCopyingForced() &&
4586             (isOpenCLMapForced() ||
4587                 (dev.hostUnifiedMemory()
4588 #ifndef __APPLE__
4589                 || dev.isIntel()
4590 #endif
4591                 )
4592             )
4593         )
4594             flags0 = 0;
4595         else
4596             flags0 = UMatData::COPY_ON_MAP;
4597     }
4598
4599     UMatData* allocate(int dims, const int* sizes, int type,
4600                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4601     {
4602         if(!useOpenCL())
4603             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4604         CV_Assert(data == 0);
4605         size_t total = CV_ELEM_SIZE(type);
4606         for( int i = dims-1; i >= 0; i-- )
4607         {
4608             if( step )
4609                 step[i] = total;
4610             total *= sizes[i];
4611         }
4612
4613         Context& ctx = Context::getDefault();
4614         flushCleanupQueue();
4615
4616         int createFlags = 0, flags0 = 0;
4617         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4618
4619         void* handle = NULL;
4620         int allocatorFlags = 0;
4621
4622 #ifdef HAVE_OPENCL_SVM
4623         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4624         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4625         {
4626             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4627             handle = bufferPoolSVM.allocate(total);
4628
4629             // this property is constant, so single buffer pool can be used here
4630             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4631             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4632         }
4633         else
4634 #endif
4635         if (createFlags == 0)
4636         {
4637             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4638             handle = bufferPool.allocate(total);
4639         }
4640         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4641         {
4642             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4643             handle = bufferPoolHostPtr.allocate(total);
4644         }
4645         else
4646         {
4647             CV_Assert(handle != NULL); // Unsupported, throw
4648         }
4649
4650         if (!handle)
4651             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4652
4653         UMatData* u = new UMatData(this);
4654         u->data = 0;
4655         u->size = total;
4656         u->handle = handle;
4657         u->flags = flags0;
4658         u->allocatorFlags_ = allocatorFlags;
4659         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4660         u->markHostCopyObsolete(true);
4661         opencl_allocator_stats.onAllocate(u->size);
4662         return u;
4663     }
4664
4665     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4666     {
4667         if(!u)
4668             return false;
4669
4670         flushCleanupQueue();
4671
4672         UMatDataAutoLock lock(u);
4673
4674         if(u->handle == 0)
4675         {
4676             CV_Assert(u->origdata != 0);
4677             Context& ctx = Context::getDefault();
4678             int createFlags = 0, flags0 = 0;
4679             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4680
4681             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
4682
4683             cl_context ctx_handle = (cl_context)ctx.ptr();
4684             int allocatorFlags = 0;
4685             int tempUMatFlags = 0;
4686             void* handle = NULL;
4687             cl_int retval = CL_SUCCESS;
4688
4689 #ifdef HAVE_OPENCL_SVM
4690             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4691             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4692             if (useSVM && svmCaps.isSupportFineGrainSystem())
4693             {
4694                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4695                 tempUMatFlags = UMatData::TEMP_UMAT;
4696                 handle = u->origdata;
4697                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4698             }
4699             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4700             {
4701                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4702                 {
4703                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4704
4705                     cl_svm_mem_flags memFlags = createFlags |
4706                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4707
4708                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4709                     CV_DbgAssert(svmFns->isValid());
4710
4711                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4712                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4713                     CV_Assert(handle);
4714
4715                     cl_command_queue q = NULL;
4716                     if (!isFineGrainBuffer)
4717                     {
4718                         q = (cl_command_queue)Queue::getDefault().ptr();
4719                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4720                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4721                                 handle, u->size,
4722                                 0, NULL, NULL);
4723                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4724
4725                     }
4726                     memcpy(handle, u->origdata, u->size);
4727                     if (!isFineGrainBuffer)
4728                     {
4729                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4730                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4731                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4732                     }
4733
4734                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4735                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4736                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4737                 }
4738             }
4739             else
4740 #endif
4741             {
4742                 if( copyOnMap )
4743                     accessFlags &= ~ACCESS_FAST;
4744
4745                 tempUMatFlags = UMatData::TEMP_UMAT;
4746                 if (
4747                 #ifdef __APPLE__
4748                     !copyOnMap &&
4749                 #endif
4750                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
4751                     // There are OpenCL runtime issues for less aligned data
4752                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
4753                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
4754                     // Avoid sharing of host memory between OpenCL buffers
4755                     && !(u->originalUMatData && u->originalUMatData->handle)
4756                 )
4757                 {
4758                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4759                                             u->size, u->origdata, &retval);
4760                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
4761                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4762                 }
4763                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4764                 {
4765                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4766                                                u->size, u->origdata, &retval);
4767                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
4768                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4769                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4770                 }
4771             }
4772             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
4773             if(!handle || retval != CL_SUCCESS)
4774                 return false;
4775             u->handle = handle;
4776             u->prevAllocator = u->currAllocator;
4777             u->currAllocator = this;
4778             u->flags |= tempUMatFlags | flags0;
4779             u->allocatorFlags_ = allocatorFlags;
4780         }
4781         if(accessFlags & ACCESS_WRITE)
4782             u->markHostCopyObsolete(true);
4783         opencl_allocator_stats.onAllocate(u->size);
4784         return true;
4785     }
4786
4787     /*void sync(UMatData* u) const
4788     {
4789         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4790         UMatDataAutoLock lock(u);
4791
4792         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4793         {
4794             if( u->tempCopiedUMat() )
4795             {
4796                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4797                                     u->size, u->origdata, 0, 0, 0);
4798             }
4799             else
4800             {
4801                 cl_int retval = 0;
4802                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4803                                                 (CL_MAP_READ | CL_MAP_WRITE),
4804                                                 0, u->size, 0, 0, 0, &retval);
4805                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4806                 clFinish(q);
4807             }
4808             u->markHostCopyObsolete(false);
4809         }
4810         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4811         {
4812             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4813                                  u->size, u->data, 0, 0, 0);
4814         }
4815     }*/
4816
4817     void deallocate(UMatData* u) const CV_OVERRIDE
4818     {
4819         if(!u)
4820             return;
4821
4822         CV_Assert(u->urefcount == 0);
4823         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
4824
4825         CV_Assert(u->handle != 0);
4826         CV_Assert(u->mapcount == 0);
4827
4828         if (u->flags & UMatData::ASYNC_CLEANUP)
4829             addToCleanupQueue(u);
4830         else
4831             deallocate_(u);
4832     }
4833
4834     void deallocate_(UMatData* u) const
4835     {
4836         CV_Assert(u);
4837         CV_Assert(u->handle);
4838         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
4839         {
4840             opencl_allocator_stats.onFree(u->size);
4841         }
4842
4843 #ifdef _WIN32
4844         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
4845             return;             // avoid any OpenCL calls
4846 #endif
4847         if(u->tempUMat())
4848         {
4849             CV_Assert(u->origdata);
4850 //            UMatDataAutoLock lock(u);
4851
4852             if (u->hostCopyObsolete())
4853             {
4854 #ifdef HAVE_OPENCL_SVM
4855                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4856                 {
4857                     Context& ctx = Context::getDefault();
4858                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4859                     CV_DbgAssert(svmFns->isValid());
4860
4861                     if( u->tempCopiedUMat() )
4862                     {
4863                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4864                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4865                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4866                         cl_command_queue q = NULL;
4867                         if (!isFineGrainBuffer)
4868                         {
4869                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4870                             q = (cl_command_queue)Queue::getDefault().ptr();
4871                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4872                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4873                                     u->handle, u->size,
4874                                     0, NULL, NULL);
4875                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4876                         }
4877                         clFinish(q);
4878                         memcpy(u->origdata, u->handle, u->size);
4879                         if (!isFineGrainBuffer)
4880                         {
4881                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4882                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4883                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4884                         }
4885                     }
4886                     else
4887                     {
4888                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4889                         // nothing
4890                     }
4891                 }
4892                 else
4893 #endif
4894                 {
4895                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4896                     if( u->tempCopiedUMat() )
4897                     {
4898                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4899                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4900                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
4901                     }
4902                     else
4903                     {
4904                         cl_int retval = 0;
4905                         if (u->tempUMat())
4906                         {
4907                             CV_Assert(u->mapcount == 0);
4908                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
4909                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4910                                 (CL_MAP_READ | CL_MAP_WRITE),
4911                                 0, u->size, 0, 0, 0, &retval);
4912                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
4913                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
4914                             if (u->originalUMatData)
4915                             {
4916                                 CV_Assert(u->originalUMatData->data == data);
4917                             }
4918                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4919                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
4920                             CV_OCL_DBG_CHECK(clFinish(q));
4921                         }
4922                     }
4923                 }
4924                 u->markHostCopyObsolete(false);
4925             }
4926             else
4927             {
4928                 // nothing
4929             }
4930 #ifdef HAVE_OPENCL_SVM
4931             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4932             {
4933                 if( u->tempCopiedUMat() )
4934                 {
4935                     Context& ctx = Context::getDefault();
4936                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4937                     CV_DbgAssert(svmFns->isValid());
4938
4939                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4940                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4941                 }
4942             }
4943             else
4944 #endif
4945             {
4946                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
4947                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
4948             }
4949             u->handle = 0;
4950             u->markDeviceCopyObsolete(true);
4951             u->currAllocator = u->prevAllocator;
4952             u->prevAllocator = NULL;
4953             if(u->data && u->copyOnMap() && u->data != u->origdata)
4954                 fastFree(u->data);
4955             u->data = u->origdata;
4956             u->currAllocator->deallocate(u);
4957             u = NULL;
4958         }
4959         else
4960         {
4961             CV_Assert(u->origdata == NULL);
4962             if(u->data && u->copyOnMap() && u->data != u->origdata)
4963             {
4964                 fastFree(u->data);
4965                 u->data = 0;
4966                 u->markHostCopyObsolete(true);
4967             }
4968             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4969             {
4970                 bufferPool.release((cl_mem)u->handle);
4971             }
4972             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4973             {
4974                 bufferPoolHostPtr.release((cl_mem)u->handle);
4975             }
4976 #ifdef HAVE_OPENCL_SVM
4977             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4978             {
4979                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4980                 {
4981                     //nothing
4982                 }
4983                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4984                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4985                 {
4986                     Context& ctx = Context::getDefault();
4987                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4988                     CV_DbgAssert(svmFns->isValid());
4989                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4990
4991                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
4992                     {
4993                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4994                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4995                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4996                     }
4997                 }
4998                 bufferPoolSVM.release((void*)u->handle);
4999             }
5000 #endif
5001             else
5002             {
5003                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5004             }
5005             u->handle = 0;
5006             u->markDeviceCopyObsolete(true);
5007             delete u;
5008             u = NULL;
5009         }
5010         CV_Assert(u == NULL);
5011     }
5012
5013     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5014     void map(UMatData* u, int accessFlags) const CV_OVERRIDE
5015     {
5016         CV_Assert(u && u->handle);
5017
5018         if(accessFlags & ACCESS_WRITE)
5019             u->markDeviceCopyObsolete(true);
5020
5021         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5022
5023         {
5024             if( !u->copyOnMap() )
5025             {
5026                 // TODO
5027                 // because there can be other map requests for the same UMat with different access flags,
5028                 // we use the universal (read-write) access mode.
5029 #ifdef HAVE_OPENCL_SVM
5030                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5031                 {
5032                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5033                     {
5034                         Context& ctx = Context::getDefault();
5035                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5036                         CV_DbgAssert(svmFns->isValid());
5037
5038                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5039                         {
5040                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5041                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5042                                     u->handle, u->size,
5043                                     0, NULL, NULL);
5044                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5045                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5046                         }
5047                     }
5048                     clFinish(q);
5049                     u->data = (uchar*)u->handle;
5050                     u->markHostCopyObsolete(false);
5051                     u->markDeviceMemMapped(true);
5052                     return;
5053                 }
5054 #endif
5055
5056                 cl_int retval = CL_SUCCESS;
5057                 if (!u->deviceMemMapped())
5058                 {
5059                     CV_Assert(u->refcount == 1);
5060                     CV_Assert(u->mapcount++ == 0);
5061                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5062                                                          (CL_MAP_READ | CL_MAP_WRITE),
5063                                                          0, u->size, 0, 0, 0, &retval);
5064                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5065                 }
5066                 if (u->data && retval == CL_SUCCESS)
5067                 {
5068                     u->markHostCopyObsolete(false);
5069                     u->markDeviceMemMapped(true);
5070                     return;
5071                 }
5072
5073                 // TODO Is it really a good idea and was it tested well?
5074                 // if map failed, switch to copy-on-map mode for the particular buffer
5075                 u->flags |= UMatData::COPY_ON_MAP;
5076             }
5077
5078             if(!u->data)
5079             {
5080                 u->data = (uchar*)fastMalloc(u->size);
5081                 u->markHostCopyObsolete(true);
5082             }
5083         }
5084
5085         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
5086         {
5087             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5088 #ifdef HAVE_OPENCL_SVM
5089             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5090 #endif
5091             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5092                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5093             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5094                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5095             u->markHostCopyObsolete(false);
5096         }
5097     }
5098
5099     void unmap(UMatData* u) const CV_OVERRIDE
5100     {
5101         if(!u)
5102             return;
5103
5104
5105         CV_Assert(u->handle != 0);
5106
5107         UMatDataAutoLock autolock(u);
5108
5109         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5110         cl_int retval = 0;
5111         if( !u->copyOnMap() && u->deviceMemMapped() )
5112         {
5113             CV_Assert(u->data != NULL);
5114 #ifdef HAVE_OPENCL_SVM
5115             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5116             {
5117                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5118                 {
5119                     Context& ctx = Context::getDefault();
5120                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5121                     CV_DbgAssert(svmFns->isValid());
5122
5123                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5124                     {
5125                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5126                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5127                                 0, NULL, NULL);
5128                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5129                         clFinish(q);
5130                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5131                     }
5132                 }
5133                 if (u->refcount == 0)
5134                     u->data = 0;
5135                 u->markDeviceCopyObsolete(false);
5136                 u->markHostCopyObsolete(true);
5137                 return;
5138             }
5139 #endif
5140             if (u->refcount == 0)
5141             {
5142                 CV_Assert(u->mapcount-- == 1);
5143                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5144                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5145                 if (Device::getDefault().isAMD())
5146                 {
5147                     // required for multithreaded applications (see stitching test)
5148                     CV_OCL_DBG_CHECK(clFinish(q));
5149                 }
5150                 u->markDeviceMemMapped(false);
5151                 u->data = 0;
5152                 u->markDeviceCopyObsolete(false);
5153                 u->markHostCopyObsolete(true);
5154             }
5155         }
5156         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5157         {
5158             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5159 #ifdef HAVE_OPENCL_SVM
5160             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5161 #endif
5162             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5163                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5164             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5165                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5166             u->markDeviceCopyObsolete(false);
5167             u->markHostCopyObsolete(true);
5168         }
5169     }
5170
5171     bool checkContinuous(int dims, const size_t sz[],
5172                          const size_t srcofs[], const size_t srcstep[],
5173                          const size_t dstofs[], const size_t dststep[],
5174                          size_t& total, size_t new_sz[],
5175                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5176                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5177     {
5178         bool iscontinuous = true;
5179         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5180         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5181         total = sz[dims-1];
5182         for( int i = dims-2; i >= 0; i-- )
5183         {
5184             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5185                 iscontinuous = false;
5186             total *= sz[i];
5187             if( srcofs )
5188                 srcrawofs += srcofs[i]*srcstep[i];
5189             if( dstofs )
5190                 dstrawofs += dstofs[i]*dststep[i];
5191         }
5192
5193         if( !iscontinuous )
5194         {
5195             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5196             if( dims == 2 )
5197             {
5198                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5199                 // we assume that new_... arrays are initialized by caller
5200                 // with 0's, so there is no else branch
5201                 if( srcofs )
5202                 {
5203                     new_srcofs[0] = srcofs[1];
5204                     new_srcofs[1] = srcofs[0];
5205                     new_srcofs[2] = 0;
5206                 }
5207
5208                 if( dstofs )
5209                 {
5210                     new_dstofs[0] = dstofs[1];
5211                     new_dstofs[1] = dstofs[0];
5212                     new_dstofs[2] = 0;
5213                 }
5214
5215                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5216                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5217             }
5218             else
5219             {
5220                 // we could check for dims == 3 here,
5221                 // but from user perspective this one is more informative
5222                 CV_Assert(dims <= 3);
5223                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5224                 if( srcofs )
5225                 {
5226                     new_srcofs[0] = srcofs[2];
5227                     new_srcofs[1] = srcofs[1];
5228                     new_srcofs[2] = srcofs[0];
5229                 }
5230
5231                 if( dstofs )
5232                 {
5233                     new_dstofs[0] = dstofs[2];
5234                     new_dstofs[1] = dstofs[1];
5235                     new_dstofs[2] = dstofs[0];
5236                 }
5237
5238                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5239                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5240             }
5241         }
5242         return iscontinuous;
5243     }
5244
5245     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5246                   const size_t srcofs[], const size_t srcstep[],
5247                   const size_t dststep[]) const CV_OVERRIDE
5248     {
5249         if(!u)
5250             return;
5251         UMatDataAutoLock autolock(u);
5252
5253         if( u->data && !u->hostCopyObsolete() )
5254         {
5255             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5256             return;
5257         }
5258         CV_Assert( u->handle != 0 );
5259
5260         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5261
5262         size_t total = 0, new_sz[] = {0, 0, 0};
5263         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5264         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5265
5266         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5267                                             total, new_sz,
5268                                             srcrawofs, new_srcofs, new_srcstep,
5269                                             dstrawofs, new_dstofs, new_dststep);
5270
5271 #ifdef HAVE_OPENCL_SVM
5272         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5273         {
5274             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5275             Context& ctx = Context::getDefault();
5276             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5277             CV_DbgAssert(svmFns->isValid());
5278
5279             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5280             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5281             {
5282                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5283                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5284                         u->handle, u->size,
5285                         0, NULL, NULL);
5286                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5287             }
5288             clFinish(q);
5289             if( iscontinuous )
5290             {
5291                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5292             }
5293             else
5294             {
5295                 // This code is from MatAllocator::download()
5296                 int isz[CV_MAX_DIM];
5297                 uchar* srcptr = (uchar*)u->handle;
5298                 for( int i = 0; i < dims; i++ )
5299                 {
5300                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5301                     if( sz[i] == 0 )
5302                     return;
5303                     if( srcofs )
5304                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5305                     isz[i] = (int)sz[i];
5306                 }
5307
5308                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5309                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5310
5311                 const Mat* arrays[] = { &src, &dst };
5312                 uchar* ptrs[2];
5313                 NAryMatIterator it(arrays, ptrs, 2);
5314                 size_t j, planesz = it.size;
5315
5316                 for( j = 0; j < it.nplanes; j++, ++it )
5317                     memcpy(ptrs[1], ptrs[0], planesz);
5318             }
5319             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5320             {
5321                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5322                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5323                         0, NULL, NULL);
5324                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5325                 clFinish(q);
5326             }
5327         }
5328         else
5329 #endif
5330         {
5331             if( iscontinuous )
5332             {
5333                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5334                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5335                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5336             }
5337             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5338             {
5339                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5340                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5341                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5342                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5343                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5344                 uchar* ptr = alignedPtr.getAlignedPtr();
5345
5346                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5347                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5348                 total = std::min(total, u->size - new_srcrawofs);
5349                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5350                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5351                 for( size_t i = 0; i < new_sz[1]; i++ )
5352                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5353             }
5354             else
5355             {
5356                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5357                 uchar* ptr = alignedPtr.getAlignedPtr();
5358
5359                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5360                     new_srcofs, new_dstofs, new_sz,
5361                     new_srcstep[0], 0,
5362                     new_dststep[0], 0,
5363                     ptr, 0, 0, 0));
5364             }
5365         }
5366     }
5367
5368     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5369                 const size_t dstofs[], const size_t dststep[],
5370                 const size_t srcstep[]) const CV_OVERRIDE
5371     {
5372         if(!u)
5373             return;
5374
5375         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5376         CV_Assert(u->refcount == 0 || u->tempUMat());
5377
5378         size_t total = 0, new_sz[] = {0, 0, 0};
5379         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5380         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5381
5382         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5383                                             total, new_sz,
5384                                             srcrawofs, new_srcofs, new_srcstep,
5385                                             dstrawofs, new_dstofs, new_dststep);
5386
5387         UMatDataAutoLock autolock(u);
5388
5389         // if there is cached CPU copy of the GPU matrix,
5390         // we could use it as a destination.
5391         // we can do it in 2 cases:
5392         //    1. we overwrite the whole content
5393         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5394         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5395         {
5396             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5397             u->markHostCopyObsolete(false);
5398             u->markDeviceCopyObsolete(true);
5399             return;
5400         }
5401
5402         CV_Assert( u->handle != 0 );
5403         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5404
5405 #ifdef HAVE_OPENCL_SVM
5406         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5407         {
5408             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5409             Context& ctx = Context::getDefault();
5410             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5411             CV_DbgAssert(svmFns->isValid());
5412
5413             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5414             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5415             {
5416                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5417                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5418                         u->handle, u->size,
5419                         0, NULL, NULL);
5420                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5421             }
5422             clFinish(q);
5423             if( iscontinuous )
5424             {
5425                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5426             }
5427             else
5428             {
5429                 // This code is from MatAllocator::upload()
5430                 int isz[CV_MAX_DIM];
5431                 uchar* dstptr = (uchar*)u->handle;
5432                 for( int i = 0; i < dims; i++ )
5433                 {
5434                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5435                     if( sz[i] == 0 )
5436                     return;
5437                     if( dstofs )
5438                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5439                     isz[i] = (int)sz[i];
5440                 }
5441
5442                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5443                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5444
5445                 const Mat* arrays[] = { &src, &dst };
5446                 uchar* ptrs[2];
5447                 NAryMatIterator it(arrays, ptrs, 2);
5448                 size_t j, planesz = it.size;
5449
5450                 for( j = 0; j < it.nplanes; j++, ++it )
5451                     memcpy(ptrs[1], ptrs[0], planesz);
5452             }
5453             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5454             {
5455                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5456                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5457                         0, NULL, NULL);
5458                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5459                 clFinish(q);
5460             }
5461         }
5462         else
5463 #endif
5464         {
5465             if( iscontinuous )
5466             {
5467                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5468                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5469                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
5470                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
5471                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5472             }
5473             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5474             {
5475                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5476                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5477                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
5478                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
5479                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5480                 uchar* ptr = alignedPtr.getAlignedPtr();
5481
5482                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5483                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
5484                 total = std::min(total, u->size - new_dstrawofs);
5485                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
5486                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
5487                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
5488                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5489                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5490                 for( size_t i = 0; i < new_sz[1]; i++ )
5491                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
5492                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5493                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5494             }
5495             else
5496             {
5497                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5498                 uchar* ptr = alignedPtr.getAlignedPtr();
5499
5500                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5501                     new_dstofs, new_srcofs, new_sz,
5502                     new_dststep[0], 0,
5503                     new_srcstep[0], 0,
5504                     ptr, 0, 0, 0));
5505             }
5506         }
5507         u->markHostCopyObsolete(true);
5508 #ifdef HAVE_OPENCL_SVM
5509         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5510                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5511         {
5512             // nothing
5513         }
5514         else
5515 #endif
5516         {
5517             u->markHostCopyObsolete(true);
5518         }
5519         u->markDeviceCopyObsolete(false);
5520     }
5521
5522     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
5523               const size_t srcofs[], const size_t srcstep[],
5524               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
5525     {
5526         if(!src || !dst)
5527             return;
5528
5529         size_t total = 0, new_sz[] = {0, 0, 0};
5530         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5531         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5532
5533         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
5534                                             total, new_sz,
5535                                             srcrawofs, new_srcofs, new_srcstep,
5536                                             dstrawofs, new_dstofs, new_dststep);
5537
5538         UMatDataAutoLock src_autolock(src, dst);
5539
5540         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5541         {
5542             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5543             return;
5544         }
5545         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5546         {
5547             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5548             dst->markHostCopyObsolete(false);
5549 #ifdef HAVE_OPENCL_SVM
5550             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5551                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5552             {
5553                 // nothing
5554             }
5555             else
5556 #endif
5557             {
5558                 dst->markDeviceCopyObsolete(true);
5559             }
5560             return;
5561         }
5562
5563         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5564         CV_Assert(dst->refcount == 0);
5565         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5566
5567         cl_int retval = CL_SUCCESS;
5568 #ifdef HAVE_OPENCL_SVM
5569         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5570                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5571         {
5572             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5573                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5574             {
5575                 Context& ctx = Context::getDefault();
5576                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5577                 CV_DbgAssert(svmFns->isValid());
5578
5579                 if( iscontinuous )
5580                 {
5581                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5582                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5583                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5584                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5585                             total, 0, NULL, NULL);
5586                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
5587                 }
5588                 else
5589                 {
5590                     clFinish(q);
5591                     // This code is from MatAllocator::download()/upload()
5592                     int isz[CV_MAX_DIM];
5593                     uchar* srcptr = (uchar*)src->handle;
5594                     for( int i = 0; i < dims; i++ )
5595                     {
5596                         CV_Assert( sz[i] <= (size_t)INT_MAX );
5597                         if( sz[i] == 0 )
5598                         return;
5599                         if( srcofs )
5600                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5601                         isz[i] = (int)sz[i];
5602                     }
5603                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5604
5605                     uchar* dstptr = (uchar*)dst->handle;
5606                     for( int i = 0; i < dims; i++ )
5607                     {
5608                         if( dstofs )
5609                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5610                     }
5611                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5612
5613                     const Mat* arrays[] = { &m_src, &m_dst };
5614                     uchar* ptrs[2];
5615                     NAryMatIterator it(arrays, ptrs, 2);
5616                     size_t j, planesz = it.size;
5617
5618                     for( j = 0; j < it.nplanes; j++, ++it )
5619                         memcpy(ptrs[1], ptrs[0], planesz);
5620                 }
5621             }
5622             else
5623             {
5624                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5625                 {
5626                     map(src, ACCESS_READ);
5627                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5628                     unmap(src);
5629                 }
5630                 else
5631                 {
5632                     map(dst, ACCESS_WRITE);
5633                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5634                     unmap(dst);
5635                 }
5636             }
5637         }
5638         else
5639 #endif
5640         {
5641             if( iscontinuous )
5642             {
5643                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5644                                                srcrawofs, dstrawofs, total, 0, 0, 0);
5645                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
5646                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
5647             }
5648             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5649             {
5650                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5651                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5652                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
5653                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5654                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
5655
5656                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5657                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5658                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
5659                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5660                 uchar* srcptr = srcBuf.getAlignedPtr();
5661                 uchar* dstptr = dstBuf.getAlignedPtr();
5662
5663                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5664
5665                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
5666                 src_total = std::min(src_total, src->size - new_srcrawofs);
5667                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
5668                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
5669
5670                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
5671                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
5672                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5673                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5674
5675                 for( size_t i = 0; i < new_sz[1]; i++ )
5676                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
5677                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
5678                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5679                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5680             }
5681             else
5682             {
5683                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5684                                                    new_srcofs, new_dstofs, new_sz,
5685                                                    new_srcstep[0], 0,
5686                                                    new_dststep[0], 0,
5687                                                    0, 0, 0));
5688             }
5689         }
5690         if (retval == CL_SUCCESS)
5691         {
5692             CV_IMPL_ADD(CV_IMPL_OCL)
5693         }
5694
5695 #ifdef HAVE_OPENCL_SVM
5696         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5697             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5698         {
5699             // nothing
5700         }
5701         else
5702 #endif
5703         {
5704             dst->markHostCopyObsolete(true);
5705         }
5706         dst->markDeviceCopyObsolete(false);
5707
5708         if( _sync )
5709         {
5710             CV_OCL_DBG_CHECK(clFinish(q));
5711         }
5712     }
5713
5714     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE {
5715 #ifdef HAVE_OPENCL_SVM
5716         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5717         {
5718             return &bufferPoolSVM;
5719         }
5720 #endif
5721         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5722         {
5723             return &bufferPoolHostPtr;
5724         }
5725         if (id != NULL && strcmp(id, "OCL") != 0)
5726         {
5727             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5728         }
5729         return &bufferPool;
5730     }
5731
5732     MatAllocator* matStdAllocator;
5733
5734     mutable cv::Mutex cleanupQueueMutex;
5735     mutable std::deque<UMatData*> cleanupQueue;
5736
5737     void flushCleanupQueue() const
5738     {
5739         if (!cleanupQueue.empty())
5740         {
5741             std::deque<UMatData*> q;
5742             {
5743                 cv::AutoLock lock(cleanupQueueMutex);
5744                 q.swap(cleanupQueue);
5745             }
5746             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
5747             {
5748                 deallocate_(*i);
5749             }
5750         }
5751     }
5752     void addToCleanupQueue(UMatData* u) const
5753     {
5754         //TODO: Validation check: CV_Assert(!u->tempUMat());
5755         {
5756             cv::AutoLock lock(cleanupQueueMutex);
5757             cleanupQueue.push_back(u);
5758         }
5759     }
5760 };
5761
5762 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
5763 {
5764     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
5765     g_isOpenCVActivated = true;
5766     return g_allocator;
5767 }
5768 MatAllocator* getOpenCLAllocator()
5769 {
5770     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
5771 }
5772
5773 }} // namespace cv::ocl
5774
5775
5776 namespace cv {
5777
5778 // three funcs below are implemented in umatrix.cpp
5779 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
5780               bool autoSteps = false );
5781 void finalizeHdr(UMat& m);
5782
5783 } // namespace cv
5784
5785
5786 namespace cv { namespace ocl {
5787
5788 /*
5789 // Convert OpenCL buffer memory to UMat
5790 */
5791 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
5792 {
5793     int d = 2;
5794     int sizes[] = { rows, cols };
5795
5796     CV_Assert(0 <= d && d <= CV_MAX_DIM);
5797
5798     dst.release();
5799
5800     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
5801     dst.usageFlags = USAGE_DEFAULT;
5802
5803     setSize(dst, d, sizes, 0, true);
5804     dst.offset = 0;
5805
5806     cl_mem             memobj = (cl_mem)cl_mem_buffer;
5807     cl_mem_object_type mem_type = 0;
5808
5809     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5810
5811     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
5812
5813     size_t total = 0;
5814     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
5815
5816     CV_OCL_CHECK(clRetainMemObject(memobj));
5817
5818     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
5819     CV_Assert(total >= rows * step);
5820
5821     // attach clBuffer to UMatData
5822     dst.u = new UMatData(getOpenCLAllocator());
5823     dst.u->data            = 0;
5824     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
5825     dst.u->flags           = 0;
5826     dst.u->handle          = cl_mem_buffer;
5827     dst.u->origdata        = 0;
5828     dst.u->prevAllocator   = 0;
5829     dst.u->size            = total;
5830
5831     finalizeHdr(dst);
5832     dst.addref();
5833
5834     return;
5835 } // convertFromBuffer()
5836
5837
5838 /*
5839 // Convert OpenCL image2d_t memory to UMat
5840 */
5841 void convertFromImage(void* cl_mem_image, UMat& dst)
5842 {
5843     cl_mem             clImage = (cl_mem)cl_mem_image;
5844     cl_mem_object_type mem_type = 0;
5845
5846     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5847
5848     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
5849
5850     cl_image_format fmt = { 0, 0 };
5851     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
5852
5853     int depth = CV_8U;
5854     switch (fmt.image_channel_data_type)
5855     {
5856     case CL_UNORM_INT8:
5857     case CL_UNSIGNED_INT8:
5858         depth = CV_8U;
5859         break;
5860
5861     case CL_SNORM_INT8:
5862     case CL_SIGNED_INT8:
5863         depth = CV_8S;
5864         break;
5865
5866     case CL_UNORM_INT16:
5867     case CL_UNSIGNED_INT16:
5868         depth = CV_16U;
5869         break;
5870
5871     case CL_SNORM_INT16:
5872     case CL_SIGNED_INT16:
5873         depth = CV_16S;
5874         break;
5875
5876     case CL_SIGNED_INT32:
5877         depth = CV_32S;
5878         break;
5879
5880     case CL_FLOAT:
5881         depth = CV_32F;
5882         break;
5883
5884     default:
5885         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
5886     }
5887
5888     int type = CV_8UC1;
5889     switch (fmt.image_channel_order)
5890     {
5891     case CL_R:
5892         type = CV_MAKE_TYPE(depth, 1);
5893         break;
5894
5895     case CL_RGBA:
5896     case CL_BGRA:
5897     case CL_ARGB:
5898         type = CV_MAKE_TYPE(depth, 4);
5899         break;
5900
5901     default:
5902         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
5903         break;
5904     }
5905
5906     size_t step = 0;
5907     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
5908
5909     size_t w = 0;
5910     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
5911
5912     size_t h = 0;
5913     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
5914
5915     dst.create((int)h, (int)w, type);
5916
5917     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
5918
5919     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5920
5921     size_t offset = 0;
5922     size_t src_origin[3] = { 0, 0, 0 };
5923     size_t region[3] = { w, h, 1 };
5924     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
5925
5926     CV_OCL_CHECK(clFinish(q));
5927
5928     return;
5929 } // convertFromImage()
5930
5931
5932 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5933
5934 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5935 {
5936     cl_uint numDevices = 0;
5937     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
5938     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
5939     {
5940         CV_OCL_DBG_CHECK_RESULT(status,
5941             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
5942     }
5943
5944     if (numDevices == 0)
5945     {
5946         devices.clear();
5947         return;
5948     }
5949
5950     devices.resize((size_t)numDevices);
5951     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
5952 }
5953
5954 struct PlatformInfo::Impl
5955 {
5956     Impl(void* id)
5957     {
5958         refcount = 1;
5959         handle = *(cl_platform_id*)id;
5960         getDevices(devices, handle);
5961     }
5962
5963     String getStrProp(cl_platform_info prop) const
5964     {
5965         char buf[1024];
5966         size_t sz=0;
5967         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5968             sz < sizeof(buf) ? String(buf) : String();
5969     }
5970
5971     IMPLEMENT_REFCOUNTABLE();
5972     std::vector<cl_device_id> devices;
5973     cl_platform_id handle;
5974 };
5975
5976 PlatformInfo::PlatformInfo()
5977 {
5978     p = 0;
5979 }
5980
5981 PlatformInfo::PlatformInfo(void* platform_id)
5982 {
5983     p = new Impl(platform_id);
5984 }
5985
5986 PlatformInfo::~PlatformInfo()
5987 {
5988     if(p)
5989         p->release();
5990 }
5991
5992 PlatformInfo::PlatformInfo(const PlatformInfo& i)
5993 {
5994     if (i.p)
5995         i.p->addref();
5996     p = i.p;
5997 }
5998
5999 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6000 {
6001     if (i.p != p)
6002     {
6003         if (i.p)
6004             i.p->addref();
6005         if (p)
6006             p->release();
6007         p = i.p;
6008     }
6009     return *this;
6010 }
6011
6012 int PlatformInfo::deviceNumber() const
6013 {
6014     return p ? (int)p->devices.size() : 0;
6015 }
6016
6017 void PlatformInfo::getDevice(Device& device, int d) const
6018 {
6019     CV_Assert(p && d < (int)p->devices.size() );
6020     if(p)
6021         device.set(p->devices[d]);
6022 }
6023
6024 String PlatformInfo::name() const
6025 {
6026     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6027 }
6028
6029 String PlatformInfo::vendor() const
6030 {
6031     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6032 }
6033
6034 String PlatformInfo::version() const
6035 {
6036     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6037 }
6038
6039 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6040 {
6041     cl_uint numPlatforms = 0;
6042     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6043
6044     if (numPlatforms == 0)
6045     {
6046         platforms.clear();
6047         return;
6048     }
6049
6050     platforms.resize((size_t)numPlatforms);
6051     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6052 }
6053
6054 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6055 {
6056     std::vector<cl_platform_id> platforms;
6057     getPlatforms(platforms);
6058
6059     for (size_t i = 0; i < platforms.size(); i++)
6060         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6061 }
6062
6063 const char* typeToStr(int type)
6064 {
6065     static const char* tab[]=
6066     {
6067         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6068         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6069         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6070         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6071         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6072         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6073         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6074         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6075     };
6076     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6077     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6078 }
6079
6080 const char* memopTypeToStr(int type)
6081 {
6082     static const char* tab[] =
6083     {
6084         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6085         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6086         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6087         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6088         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6089         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6090         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6091         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6092     };
6093     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6094     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6095 }
6096
6097 const char* vecopTypeToStr(int type)
6098 {
6099     static const char* tab[] =
6100     {
6101         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6102         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6103         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6104         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6105         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6106         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6107         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6108         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6109     };
6110     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6111     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6112 }
6113
6114 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6115 {
6116     if( sdepth == ddepth )
6117         return "noconvert";
6118     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6119     if( ddepth >= CV_32F ||
6120         (ddepth == CV_32S && sdepth < CV_32S) ||
6121         (ddepth == CV_16S && sdepth <= CV_8S) ||
6122         (ddepth == CV_16U && sdepth == CV_8U))
6123     {
6124         sprintf(buf, "convert_%s", typestr);
6125     }
6126     else if( sdepth >= CV_32F )
6127         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6128     else
6129         sprintf(buf, "convert_%s_sat", typestr);
6130
6131     return buf;
6132 }
6133
6134 const char* getOpenCLErrorString(int errorCode)
6135 {
6136 #define CV_OCL_CODE(id) case id: return #id
6137 #define CV_OCL_CODE_(id, name) case id: return #name
6138     switch (errorCode)
6139     {
6140     CV_OCL_CODE(CL_SUCCESS);
6141     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6142     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6143     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6144     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6145     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6146     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6147     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6148     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6149     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6150     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6151     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6152     CV_OCL_CODE(CL_MAP_FAILURE);
6153     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6154     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6155     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6156     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6157     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6158     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6159     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6160     CV_OCL_CODE(CL_INVALID_VALUE);
6161     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6162     CV_OCL_CODE(CL_INVALID_PLATFORM);
6163     CV_OCL_CODE(CL_INVALID_DEVICE);
6164     CV_OCL_CODE(CL_INVALID_CONTEXT);
6165     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6166     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6167     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6168     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6169     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6170     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6171     CV_OCL_CODE(CL_INVALID_SAMPLER);
6172     CV_OCL_CODE(CL_INVALID_BINARY);
6173     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6174     CV_OCL_CODE(CL_INVALID_PROGRAM);
6175     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6176     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6177     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6178     CV_OCL_CODE(CL_INVALID_KERNEL);
6179     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6180     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6181     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6182     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6183     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6184     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6185     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6186     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6187     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6188     CV_OCL_CODE(CL_INVALID_EVENT);
6189     CV_OCL_CODE(CL_INVALID_OPERATION);
6190     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6191     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6192     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6193     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6194     // OpenCL 1.1
6195     CV_OCL_CODE(CL_INVALID_PROPERTY);
6196     // OpenCL 1.2
6197     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6198     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6199     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6200     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6201     // OpenCL 2.0
6202     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6203     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6204     // Extensions
6205     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6206     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6207     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6208     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6209     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6210     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6211     default: return "Unknown OpenCL error";
6212     }
6213 #undef CV_OCL_CODE
6214 #undef CV_OCL_CODE_
6215 }
6216
6217 template <typename T>
6218 static std::string kerToStr(const Mat & k)
6219 {
6220     int width = k.cols - 1, depth = k.depth();
6221     const T * const data = k.ptr<T>();
6222
6223     std::ostringstream stream;
6224     stream.precision(10);
6225
6226     if (depth <= CV_8S)
6227     {
6228         for (int i = 0; i < width; ++i)
6229             stream << "DIG(" << (int)data[i] << ")";
6230         stream << "DIG(" << (int)data[width] << ")";
6231     }
6232     else if (depth == CV_32F)
6233     {
6234         stream.setf(std::ios_base::showpoint);
6235         for (int i = 0; i < width; ++i)
6236             stream << "DIG(" << data[i] << "f)";
6237         stream << "DIG(" << data[width] << "f)";
6238     }
6239     else
6240     {
6241         for (int i = 0; i < width; ++i)
6242             stream << "DIG(" << data[i] << ")";
6243         stream << "DIG(" << data[width] << ")";
6244     }
6245
6246     return stream.str();
6247 }
6248
6249 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6250 {
6251     Mat kernel = _kernel.getMat().reshape(1, 1);
6252
6253     int depth = kernel.depth();
6254     if (ddepth < 0)
6255         ddepth = depth;
6256
6257     if (ddepth != depth)
6258         kernel.convertTo(kernel, ddepth);
6259
6260     typedef std::string (* func_t)(const Mat &);
6261     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6262                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6263     const func_t func = funcs[ddepth];
6264     CV_Assert(func != 0);
6265
6266     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6267 }
6268
6269 #define PROCESS_SRC(src) \
6270     do \
6271     { \
6272         if (!src.empty()) \
6273         { \
6274             CV_Assert(src.isMat() || src.isUMat()); \
6275             Size csize = src.size(); \
6276             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6277                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6278             if (cwidth < ckercn || ckercn <= 0) \
6279                 return 1; \
6280             cols.push_back(cwidth); \
6281             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6282                 return 1; \
6283             offsets.push_back(src.offset()); \
6284             steps.push_back(src.step()); \
6285             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6286             kercns.push_back(ckercn); \
6287         } \
6288     } \
6289     while ((void)0, 0)
6290
6291 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6292                               InputArray src4, InputArray src5, InputArray src6,
6293                               InputArray src7, InputArray src8, InputArray src9,
6294                               OclVectorStrategy strat)
6295 {
6296     const ocl::Device & d = ocl::Device::getDefault();
6297
6298     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6299         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6300         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6301         d.preferredVectorWidthDouble(), -1 };
6302
6303     // if the device says don't use vectors
6304     if (vectorWidths[0] == 1)
6305     {
6306         // it's heuristic
6307         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6308         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6309         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6310     }
6311
6312     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6313 }
6314
6315 int checkOptimalVectorWidth(const int *vectorWidths,
6316                             InputArray src1, InputArray src2, InputArray src3,
6317                             InputArray src4, InputArray src5, InputArray src6,
6318                             InputArray src7, InputArray src8, InputArray src9,
6319                             OclVectorStrategy strat)
6320 {
6321     CV_Assert(vectorWidths);
6322
6323     int ref_type = src1.type();
6324
6325     std::vector<size_t> offsets, steps, cols;
6326     std::vector<int> dividers, kercns;
6327     PROCESS_SRC(src1);
6328     PROCESS_SRC(src2);
6329     PROCESS_SRC(src3);
6330     PROCESS_SRC(src4);
6331     PROCESS_SRC(src5);
6332     PROCESS_SRC(src6);
6333     PROCESS_SRC(src7);
6334     PROCESS_SRC(src8);
6335     PROCESS_SRC(src9);
6336
6337     size_t size = offsets.size();
6338
6339     for (size_t i = 0; i < size; ++i)
6340         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6341             dividers[i] >>= 1, kercns[i] >>= 1;
6342
6343     // default strategy
6344     int kercn = *std::min_element(kercns.begin(), kercns.end());
6345
6346     return kercn;
6347 }
6348
6349 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6350                                  InputArray src4, InputArray src5, InputArray src6,
6351                                  InputArray src7, InputArray src8, InputArray src9)
6352 {
6353     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6354 }
6355
6356 #undef PROCESS_SRC
6357
6358
6359 // TODO Make this as a method of OpenCL "BuildOptions" class
6360 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6361 {
6362     if (!buildOptions.empty())
6363         buildOptions += " ";
6364     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6365     buildOptions += format(
6366             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6367             name.c_str(), ocl::typeToStr(type),
6368             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6369             name.c_str(), (int)CV_MAT_CN(type),
6370             name.c_str(), (int)CV_ELEM_SIZE(type),
6371             name.c_str(), (int)CV_ELEM_SIZE1(type),
6372             name.c_str(), (int)depth
6373             );
6374 }
6375
6376
6377 struct Image2D::Impl
6378 {
6379     Impl(const UMat &src, bool norm, bool alias)
6380     {
6381         handle = 0;
6382         refcount = 1;
6383         init(src, norm, alias);
6384     }
6385
6386     ~Impl()
6387     {
6388         if (handle)
6389             clReleaseMemObject(handle);
6390     }
6391
6392     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6393     {
6394         cl_image_format format;
6395         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6396                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6397         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6398                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6399         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6400
6401         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6402         int channelOrder = channelOrders[cn];
6403         format.image_channel_data_type = (cl_channel_type)channelType;
6404         format.image_channel_order = (cl_channel_order)channelOrder;
6405         return format;
6406     }
6407
6408     static bool isFormatSupported(cl_image_format format)
6409     {
6410         if (!haveOpenCL())
6411             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6412
6413         cl_context context = (cl_context)Context::getDefault().ptr();
6414         if (!context)
6415             return false;
6416
6417         // Figure out how many formats are supported by this context.
6418         cl_uint numFormats = 0;
6419         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6420                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6421                                                 NULL, &numFormats);
6422         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6423         if (numFormats > 0)
6424         {
6425             AutoBuffer<cl_image_format> formats(numFormats);
6426             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6427                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6428                                              formats.data(), NULL);
6429             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6430             for (cl_uint i = 0; i < numFormats; ++i)
6431             {
6432                 if (!memcmp(&formats[i], &format, sizeof(format)))
6433                 {
6434                     return true;
6435                 }
6436             }
6437         }
6438         return false;
6439     }
6440
6441     void init(const UMat &src, bool norm, bool alias)
6442     {
6443         if (!haveOpenCL())
6444             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6445
6446         CV_Assert(!src.empty());
6447         CV_Assert(ocl::Device::getDefault().imageSupport());
6448
6449         int err, depth = src.depth(), cn = src.channels();
6450         CV_Assert(cn <= 4);
6451         cl_image_format format = getImageFormat(depth, cn, norm);
6452
6453         if (!isFormatSupported(format))
6454             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
6455
6456         if (alias && !src.handle(ACCESS_RW))
6457             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
6458
6459         cl_context context = (cl_context)Context::getDefault().ptr();
6460         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
6461
6462 #ifdef CL_VERSION_1_2
6463         // this enables backwards portability to
6464         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
6465         const Device & d = ocl::Device::getDefault();
6466         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
6467         CV_Assert(!alias || canCreateAlias(src));
6468         if (1 < major || (1 == major && 2 <= minor))
6469         {
6470             cl_image_desc desc;
6471             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
6472             desc.image_width      = src.cols;
6473             desc.image_height     = src.rows;
6474             desc.image_depth      = 0;
6475             desc.image_array_size = 1;
6476             desc.image_row_pitch  = alias ? src.step[0] : 0;
6477             desc.image_slice_pitch = 0;
6478             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
6479             desc.num_mip_levels   = 0;
6480             desc.num_samples      = 0;
6481             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
6482         }
6483         else
6484 #endif
6485         {
6486             CV_SUPPRESS_DEPRECATED_START
6487             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
6488             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
6489             CV_SUPPRESS_DEPRECATED_END
6490         }
6491         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
6492
6493         size_t origin[] = { 0, 0, 0 };
6494         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
6495
6496         cl_mem devData;
6497         if (!alias && !src.isContinuous())
6498         {
6499             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
6500             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
6501                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
6502                 ).c_str());
6503
6504             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
6505             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
6506                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
6507             CV_OCL_DBG_CHECK(clFlush(queue));
6508         }
6509         else
6510         {
6511             devData = (cl_mem)src.handle(ACCESS_READ);
6512         }
6513         CV_Assert(devData != NULL);
6514
6515         if (!alias)
6516         {
6517             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
6518             if (!src.isContinuous())
6519             {
6520                 CV_OCL_DBG_CHECK(clFlush(queue));
6521                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
6522             }
6523         }
6524     }
6525
6526     IMPLEMENT_REFCOUNTABLE();
6527
6528     cl_mem handle;
6529 };
6530
6531 Image2D::Image2D()
6532 {
6533     p = NULL;
6534 }
6535
6536 Image2D::Image2D(const UMat &src, bool norm, bool alias)
6537 {
6538     p = new Impl(src, norm, alias);
6539 }
6540
6541 bool Image2D::canCreateAlias(const UMat &m)
6542 {
6543     bool ret = false;
6544     const Device & d = ocl::Device::getDefault();
6545     if (d.imageFromBufferSupport() && !m.empty())
6546     {
6547         // This is the required pitch alignment in pixels
6548         uint pitchAlign = d.imagePitchAlignment();
6549         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
6550         {
6551             // We don't currently handle the case where the buffer was created
6552             // with CL_MEM_USE_HOST_PTR
6553             if (!m.u->tempUMat())
6554             {
6555                 ret = true;
6556             }
6557         }
6558     }
6559     return ret;
6560 }
6561
6562 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
6563 {
6564     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
6565
6566     return Impl::isFormatSupported(format);
6567 }
6568
6569 Image2D::Image2D(const Image2D & i)
6570 {
6571     p = i.p;
6572     if (p)
6573         p->addref();
6574 }
6575
6576 Image2D & Image2D::operator = (const Image2D & i)
6577 {
6578     if (i.p != p)
6579     {
6580         if (i.p)
6581             i.p->addref();
6582         if (p)
6583             p->release();
6584         p = i.p;
6585     }
6586     return *this;
6587 }
6588
6589 Image2D::~Image2D()
6590 {
6591     if (p)
6592         p->release();
6593 }
6594
6595 void* Image2D::ptr() const
6596 {
6597     return p ? p->handle : 0;
6598 }
6599
6600 bool internal::isOpenCLForced()
6601 {
6602     static bool initialized = false;
6603     static bool value = false;
6604     if (!initialized)
6605     {
6606         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
6607         initialized = true;
6608     }
6609     return value;
6610 }
6611
6612 bool internal::isPerformanceCheckBypassed()
6613 {
6614     static bool initialized = false;
6615     static bool value = false;
6616     if (!initialized)
6617     {
6618         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
6619         initialized = true;
6620     }
6621     return value;
6622 }
6623
6624 bool internal::isCLBuffer(UMat& u)
6625 {
6626     void* h = u.handle(ACCESS_RW);
6627     if (!h)
6628         return true;
6629     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
6630 #if 1
6631     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
6632         return false;
6633 #else
6634     cl_mem_object_type type = 0;
6635     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
6636     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
6637         return false;
6638 #endif
6639     return true;
6640 }
6641
6642 struct Timer::Impl
6643 {
6644     const Queue queue;
6645
6646     Impl(const Queue& q)
6647         : queue(q)
6648     {
6649     }
6650
6651     ~Impl(){}
6652
6653     void start()
6654     {
6655         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6656         timer.start();
6657     }
6658
6659     void stop()
6660     {
6661         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6662         timer.stop();
6663     }
6664
6665     uint64 durationNS() const
6666     {
6667         return (uint64)(timer.getTimeSec() * 1e9);
6668     }
6669
6670     TickMeter timer;
6671 };
6672
6673 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
6674 Timer::~Timer() { delete p; }
6675
6676 void Timer::start()
6677 {
6678     CV_Assert(p);
6679     p->start();
6680 }
6681
6682 void Timer::stop()
6683 {
6684     CV_Assert(p);
6685     p->stop();
6686 }
6687
6688 uint64 Timer::durationNS() const
6689 {
6690     CV_Assert(p);
6691     return p->durationNS();
6692 }
6693
6694 }} // namespace
6695
6696 #endif // HAVE_OPENCL