ocl: fix PlatformInfo usage
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832
833 struct OpenCLExecutionContext::Impl
834 {
835     ocl::Context context_;
836     int device_;  // device index in context
837     ocl::Queue queue_;
838     int useOpenCL_;
839
840 protected:
841     Impl() = delete;
842
843     void _init_device(cl_device_id deviceID)
844     {
845         CV_Assert(deviceID);
846         int ndevices = (int)context_.ndevices();
847         CV_Assert(ndevices > 0);
848         bool found = false;
849         for (int i = 0; i < ndevices; i++)
850         {
851             ocl::Device d = context_.device(i);
852             cl_device_id dhandle = (cl_device_id)d.ptr();
853             if (dhandle == deviceID)
854             {
855                 device_ = i;
856                 found = true;
857                 break;
858             }
859         }
860         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
861     }
862
863     void _init_device(const ocl::Device& device)
864     {
865         CV_Assert(device.ptr());
866         int ndevices = (int)context_.ndevices();
867         CV_Assert(ndevices > 0);
868         bool found = false;
869         for (int i = 0; i < ndevices; i++)
870         {
871             ocl::Device d = context_.device(i);
872             if (d.getImpl() == device.getImpl())
873             {
874                 device_ = i;
875                 found = true;
876                 break;
877             }
878         }
879         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
880     }
881
882 public:
883     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
884         : device_(0), useOpenCL_(-1)
885     {
886         CV_UNUSED(platformID);
887         CV_Assert(context);
888         CV_Assert(deviceID);
889
890         context_ = Context::fromHandle(context);
891         _init_device(deviceID);
892         queue_ = Queue(context_, context_.device(device_));
893     }
894
895     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
896         : device_(0), useOpenCL_(-1)
897     {
898         CV_Assert(context.ptr());
899         CV_Assert(device.ptr());
900
901         context_ = context;
902         _init_device(device);
903         queue_ = queue;
904     }
905
906     Impl(const ocl::Context& context, const ocl::Device& device)
907         : device_(0), useOpenCL_(-1)
908     {
909         CV_Assert(context.ptr());
910         CV_Assert(device.ptr());
911
912         context_ = context;
913         _init_device(device);
914         queue_ = Queue(context_, context_.device(device_));
915     }
916
917     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
918         : context_(context)
919         , device_(device)
920         , queue_(queue)
921         , useOpenCL_(-1)
922     {
923         // nothing
924     }
925     Impl(const Impl& other)
926         : context_(other.context_)
927         , device_(other.device_)
928         , queue_(other.queue_)
929         , useOpenCL_(-1)
930     {
931         // nothing
932     }
933
934     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
935     bool useOpenCL()
936     {
937         if (useOpenCL_ < 0)
938         {
939             try
940             {
941                 useOpenCL_ = 0;
942                 if (!context_.empty() && context_.ndevices() > 0)
943                 {
944                     const Device& d = context_.device(device_);
945                     useOpenCL_ = d.available();
946                 }
947             }
948             catch (const cv::Exception&)
949             {
950                 // nothing
951             }
952             if (!useOpenCL_)
953                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
954         }
955         return useOpenCL_ > 0;
956     }
957
958     void setUseOpenCL(bool flag)
959     {
960         if (!flag)
961             useOpenCL_ = 0;
962         else
963             useOpenCL_ = -1;
964     }
965
966     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
967     {
968         CV_TRACE_FUNCTION();
969
970         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
971
972         static bool initialized = false;
973         static std::shared_ptr<Impl> g_primaryExecutionContext;
974
975         if (!initialized)
976         {
977             cv::AutoLock lock(getInitializationMutex());
978             if (!initialized)
979             {
980                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
981                 try
982                 {
983                     Context c = ocl::Context::create(std::string());
984                     if (c.ndevices())
985                     {
986                         int deviceId = 0;
987                         auto& d = c.device(deviceId);
988                         if (d.available())
989                         {
990                             auto q = ocl::Queue(c, d);
991                             if (!q.ptr())
992                             {
993                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
994                             }
995                             else
996                             {
997                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
998                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
999                             }
1000                         }
1001                         else
1002                         {
1003                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1004                         }
1005                     }
1006                     else
1007                     {
1008                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1009                     }
1010                 }
1011                 catch (const std::exception& e)
1012                 {
1013                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1014                 }
1015                 catch (...)
1016                 {
1017                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1018                 }
1019                 initialized = true;
1020             }
1021         }
1022         return g_primaryExecutionContext;
1023     }
1024 };
1025
1026 Context& OpenCLExecutionContext::getContext() const
1027 {
1028     CV_Assert(p);
1029     return p->context_;
1030 }
1031 Device& OpenCLExecutionContext::getDevice() const
1032 {
1033     CV_Assert(p);
1034     return p->context_.device(p->device_);
1035 }
1036 Queue& OpenCLExecutionContext::getQueue() const
1037 {
1038     CV_Assert(p);
1039     return p->queue_;
1040 }
1041
1042 bool OpenCLExecutionContext::useOpenCL() const
1043 {
1044     if (p)
1045         return p->useOpenCL();
1046     return false;
1047 }
1048 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1049 {
1050     CV_Assert(p);
1051     p->setUseOpenCL(flag);
1052 }
1053
1054 /* static */
1055 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1056 {
1057     CV_TRACE_FUNCTION();
1058     CoreTLSData& data = getCoreTlsData();
1059     OpenCLExecutionContext& c = data.oclExecutionContext;
1060     if (!data.oclExecutionContextInitialized)
1061     {
1062         data.oclExecutionContextInitialized = true;
1063         if (c.empty() && haveOpenCL())
1064             c.p = Impl::getInitializedExecutionContext();
1065     }
1066     return c;
1067 }
1068
1069 /* static */
1070 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1071 {
1072     CV_TRACE_FUNCTION();
1073     CoreTLSData& data = getCoreTlsData();
1074     OpenCLExecutionContext& c = data.oclExecutionContext;
1075     return c;
1076 }
1077
1078 void OpenCLExecutionContext::bind() const
1079 {
1080     CV_TRACE_FUNCTION();
1081     CV_Assert(p);
1082     CoreTLSData& data = getCoreTlsData();
1083     data.oclExecutionContext = *this;
1084     data.oclExecutionContextInitialized = true;
1085     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1086 }
1087
1088
1089 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1090 {
1091     CV_TRACE_FUNCTION();
1092     CV_Assert(p);
1093     const Queue q(getContext(), getDevice());
1094     return cloneWithNewQueue(q);
1095 }
1096
1097 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1098 {
1099     CV_TRACE_FUNCTION();
1100     CV_Assert(p);
1101     CV_Assert(q.ptr() != NULL);
1102     OpenCLExecutionContext c;
1103     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1104     return c;
1105 }
1106
1107 /* static */
1108 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1109 {
1110     CV_TRACE_FUNCTION();
1111     if (!haveOpenCL())
1112         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1113
1114     CV_Assert(!context.empty());
1115     CV_Assert(context.ptr());
1116     CV_Assert(!device.empty());
1117     CV_Assert(device.ptr());
1118     OpenCLExecutionContext ctx;
1119     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1120     return ctx;
1121
1122 }
1123
1124 /* static */
1125 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1126 {
1127     CV_TRACE_FUNCTION();
1128     if (!haveOpenCL())
1129         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1130
1131     CV_Assert(!context.empty());
1132     CV_Assert(context.ptr());
1133     CV_Assert(!device.empty());
1134     CV_Assert(device.ptr());
1135     OpenCLExecutionContext ctx;
1136     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1137     return ctx;
1138
1139 }
1140
1141 void OpenCLExecutionContext::release()
1142 {
1143     CV_TRACE_FUNCTION();
1144     p.reset();
1145 }
1146
1147
1148 // true if we have initialized OpenCL subsystem with available platforms
1149 static bool g_isOpenCVActivated = false;
1150
1151 bool haveOpenCL()
1152 {
1153     CV_TRACE_FUNCTION();
1154     static bool g_isOpenCLInitialized = false;
1155     static bool g_isOpenCLAvailable = false;
1156
1157     if (!g_isOpenCLInitialized)
1158     {
1159         CV_TRACE_REGION("Init_OpenCL_Runtime");
1160         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1161         if (envPath)
1162         {
1163             if (cv::String(envPath) == "disabled")
1164             {
1165                 g_isOpenCLAvailable = false;
1166                 g_isOpenCLInitialized = true;
1167                 return false;
1168             }
1169         }
1170
1171         cv::AutoLock lock(getInitializationMutex());
1172         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1173         try
1174         {
1175             cl_uint n = 0;
1176             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1177             g_isOpenCVActivated = n > 0;
1178             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1179         }
1180         catch (...)
1181         {
1182             g_isOpenCLAvailable = false;
1183         }
1184         g_isOpenCLInitialized = true;
1185     }
1186     return g_isOpenCLAvailable;
1187 }
1188
1189 bool useOpenCL()
1190 {
1191     CoreTLSData& data = getCoreTlsData();
1192     if (data.useOpenCL < 0)
1193     {
1194         try
1195         {
1196             data.useOpenCL = 0;
1197             if (haveOpenCL())
1198             {
1199                 auto c = OpenCLExecutionContext::getCurrent();
1200                 data.useOpenCL = c.useOpenCL();
1201             }
1202         }
1203         catch (...)
1204         {
1205             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1206         }
1207     }
1208     return data.useOpenCL > 0;
1209 }
1210
1211 bool isOpenCLActivated()
1212 {
1213     if (!g_isOpenCVActivated)
1214         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1215     return useOpenCL();
1216 }
1217
1218 void setUseOpenCL(bool flag)
1219 {
1220     CV_TRACE_FUNCTION();
1221
1222     CoreTLSData& data = getCoreTlsData();
1223     auto& c = OpenCLExecutionContext::getCurrentRef();
1224     if (!c.empty())
1225     {
1226         c.setUseOpenCL(flag);
1227         data.useOpenCL = c.useOpenCL();
1228     }
1229     else
1230     {
1231         if (!flag)
1232             data.useOpenCL = 0;
1233         else
1234             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1235     }
1236 }
1237
1238
1239
1240 #ifdef HAVE_CLAMDBLAS
1241
1242 class AmdBlasHelper
1243 {
1244 public:
1245     static AmdBlasHelper & getInstance()
1246     {
1247         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1248     }
1249
1250     bool isAvailable() const
1251     {
1252         return g_isAmdBlasAvailable;
1253     }
1254
1255     ~AmdBlasHelper()
1256     {
1257         try
1258         {
1259             clAmdBlasTeardown();
1260         }
1261         catch (...) { }
1262     }
1263
1264 protected:
1265     AmdBlasHelper()
1266     {
1267         if (!g_isAmdBlasInitialized)
1268         {
1269             AutoLock lock(getInitializationMutex());
1270
1271             if (!g_isAmdBlasInitialized)
1272             {
1273                 if (haveOpenCL())
1274                 {
1275                     try
1276                     {
1277                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1278                     }
1279                     catch (...)
1280                     {
1281                         g_isAmdBlasAvailable = false;
1282                     }
1283                 }
1284                 else
1285                     g_isAmdBlasAvailable = false;
1286
1287                 g_isAmdBlasInitialized = true;
1288             }
1289         }
1290     }
1291
1292 private:
1293     static bool g_isAmdBlasInitialized;
1294     static bool g_isAmdBlasAvailable;
1295 };
1296
1297 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1298 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1299
1300 bool haveAmdBlas()
1301 {
1302     return AmdBlasHelper::getInstance().isAvailable();
1303 }
1304
1305 #else
1306
1307 bool haveAmdBlas()
1308 {
1309     return false;
1310 }
1311
1312 #endif
1313
1314 #ifdef HAVE_CLAMDFFT
1315
1316 class AmdFftHelper
1317 {
1318 public:
1319     static AmdFftHelper & getInstance()
1320     {
1321         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1322     }
1323
1324     bool isAvailable() const
1325     {
1326         return g_isAmdFftAvailable;
1327     }
1328
1329     ~AmdFftHelper()
1330     {
1331         try
1332         {
1333 //            clAmdFftTeardown();
1334         }
1335         catch (...) { }
1336     }
1337
1338 protected:
1339     AmdFftHelper()
1340     {
1341         if (!g_isAmdFftInitialized)
1342         {
1343             AutoLock lock(getInitializationMutex());
1344
1345             if (!g_isAmdFftInitialized)
1346             {
1347                 if (haveOpenCL())
1348                 {
1349                     try
1350                     {
1351                         cl_uint major, minor, patch;
1352                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1353
1354                         // it throws exception in case AmdFft binaries are not found
1355                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1356                         g_isAmdFftAvailable = true;
1357                     }
1358                     catch (const Exception &)
1359                     {
1360                         g_isAmdFftAvailable = false;
1361                     }
1362                 }
1363                 else
1364                     g_isAmdFftAvailable = false;
1365
1366                 g_isAmdFftInitialized = true;
1367             }
1368         }
1369     }
1370
1371 private:
1372     static clAmdFftSetupData setupData;
1373     static bool g_isAmdFftInitialized;
1374     static bool g_isAmdFftAvailable;
1375 };
1376
1377 clAmdFftSetupData AmdFftHelper::setupData;
1378 bool AmdFftHelper::g_isAmdFftAvailable = false;
1379 bool AmdFftHelper::g_isAmdFftInitialized = false;
1380
1381 bool haveAmdFft()
1382 {
1383     return AmdFftHelper::getInstance().isAvailable();
1384 }
1385
1386 #else
1387
1388 bool haveAmdFft()
1389 {
1390     return false;
1391 }
1392
1393 #endif
1394
1395 bool haveSVM()
1396 {
1397 #ifdef HAVE_OPENCL_SVM
1398     return true;
1399 #else
1400     return false;
1401 #endif
1402 }
1403
1404 void finish()
1405 {
1406     Queue::getDefault().finish();
1407 }
1408
1409 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1410
1411 struct Platform::Impl
1412 {
1413     Impl()
1414     {
1415         refcount = 1;
1416         handle = 0;
1417         initialized = false;
1418     }
1419
1420     ~Impl() {}
1421
1422     void init()
1423     {
1424         if( !initialized )
1425         {
1426             //cl_uint num_entries
1427             cl_uint n = 0;
1428             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1429                 handle = 0;
1430             if( handle != 0 )
1431             {
1432                 char buf[1000];
1433                 size_t len = 0;
1434                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1435                 buf[len] = '\0';
1436                 vendor = String(buf);
1437             }
1438
1439             initialized = true;
1440         }
1441     }
1442
1443     IMPLEMENT_REFCOUNTABLE();
1444
1445     cl_platform_id handle;
1446     String vendor;
1447     bool initialized;
1448 };
1449
1450 Platform::Platform()
1451 {
1452     p = 0;
1453 }
1454
1455 Platform::~Platform()
1456 {
1457     if(p)
1458         p->release();
1459 }
1460
1461 Platform::Platform(const Platform& pl)
1462 {
1463     p = (Impl*)pl.p;
1464     if(p)
1465         p->addref();
1466 }
1467
1468 Platform& Platform::operator = (const Platform& pl)
1469 {
1470     Impl* newp = (Impl*)pl.p;
1471     if(newp)
1472         newp->addref();
1473     if(p)
1474         p->release();
1475     p = newp;
1476     return *this;
1477 }
1478
1479 void* Platform::ptr() const
1480 {
1481     return p ? p->handle : 0;
1482 }
1483
1484 Platform& Platform::getDefault()
1485 {
1486     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1487     static Platform p;
1488     if( !p.p )
1489     {
1490         p.p = new Impl;
1491         p.p->init();
1492     }
1493     return p;
1494 }
1495
1496 /////////////////////////////////////// Device ////////////////////////////////////////////
1497
1498 // deviceVersion has format
1499 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1500 // by specification
1501 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1502 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1503 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1504 {
1505     major = minor = 0;
1506     if (10 >= deviceVersion.length())
1507         return;
1508     const char *pstr = deviceVersion.c_str();
1509     if (0 != strncmp(pstr, "OpenCL ", 7))
1510         return;
1511     size_t ppos = deviceVersion.find('.', 7);
1512     if (String::npos == ppos)
1513         return;
1514     String temp = deviceVersion.substr(7, ppos - 7);
1515     major = atoi(temp.c_str());
1516     temp = deviceVersion.substr(ppos + 1);
1517     minor = atoi(temp.c_str());
1518 }
1519
1520 struct Device::Impl
1521 {
1522     Impl(void* d)
1523         : refcount(1)
1524         , handle(0)
1525     {
1526         try
1527         {
1528             cl_device_id device = (cl_device_id)d;
1529             _init(device);
1530             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1531         }
1532         catch (...)
1533         {
1534             throw;
1535         }
1536     }
1537
1538     void _init(cl_device_id d)
1539     {
1540         handle = (cl_device_id)d;
1541
1542         name_ = getStrProp(CL_DEVICE_NAME);
1543         version_ = getStrProp(CL_DEVICE_VERSION);
1544         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1545         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1546         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1547         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1548         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1549         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1550         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1551         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1552
1553         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1554         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1555
1556         size_t pos = 0;
1557         while (pos < extensions_.size())
1558         {
1559             size_t pos2 = extensions_.find(' ', pos);
1560             if (pos2 == String::npos)
1561                 pos2 = extensions_.size();
1562             if (pos2 > pos)
1563             {
1564                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1565                 extensions_set_.insert(extensionName);
1566             }
1567             pos = pos2 + 1;
1568         }
1569
1570         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1571
1572         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1573         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1574             vendorName_ == "AMD")
1575             vendorID_ = VENDOR_AMD;
1576         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1577             vendorID_ = VENDOR_INTEL;
1578         else if (vendorName_ == "NVIDIA Corporation")
1579             vendorID_ = VENDOR_NVIDIA;
1580         else
1581             vendorID_ = UNKNOWN_VENDOR;
1582
1583         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1584         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1585         {
1586             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1587             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1588                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1589             maxWorkGroupSize_ = new_maxWorkGroupSize;
1590         }
1591 #if 0
1592         if (isExtensionSupported("cl_khr_spir"))
1593         {
1594 #ifndef CL_DEVICE_SPIR_VERSIONS
1595 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1596 #endif
1597             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1598             std::cout << spir_versions << std::endl;
1599         }
1600 #endif
1601     }
1602
1603     ~Impl()
1604     {
1605 #ifdef _WIN32
1606         if (!cv::__termination)
1607 #endif
1608         {
1609             if (handle)
1610             {
1611                 CV_OCL_CHECK(clReleaseDevice(handle));
1612                 handle = 0;
1613             }
1614         }
1615     }
1616
1617     template<typename _TpCL, typename _TpOut>
1618     _TpOut getProp(cl_device_info prop) const
1619     {
1620         _TpCL temp=_TpCL();
1621         size_t sz = 0;
1622
1623         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1624             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1625     }
1626
1627     bool getBoolProp(cl_device_info prop) const
1628     {
1629         cl_bool temp = CL_FALSE;
1630         size_t sz = 0;
1631
1632         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1633             sz == sizeof(temp) ? temp != 0 : false;
1634     }
1635
1636     String getStrProp(cl_device_info prop) const
1637     {
1638         char buf[4096];
1639         size_t sz=0;
1640         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1641             sz < sizeof(buf) ? String(buf) : String();
1642     }
1643
1644     bool isExtensionSupported(const std::string& extensionName) const
1645     {
1646         return extensions_set_.count(extensionName) > 0;
1647     }
1648
1649
1650     IMPLEMENT_REFCOUNTABLE();
1651
1652     cl_device_id handle;
1653
1654     String name_;
1655     String version_;
1656     std::string extensions_;
1657     int doubleFPConfig_;
1658     bool hostUnifiedMemory_;
1659     int maxComputeUnits_;
1660     size_t maxWorkGroupSize_;
1661     int type_;
1662     int addressBits_;
1663     int deviceVersionMajor_;
1664     int deviceVersionMinor_;
1665     String driverVersion_;
1666     String vendorName_;
1667     int vendorID_;
1668     bool intelSubgroupsSupport_;
1669
1670     std::set<std::string> extensions_set_;
1671 };
1672
1673
1674 Device::Device()
1675 {
1676     p = 0;
1677 }
1678
1679 Device::Device(void* d)
1680 {
1681     p = 0;
1682     set(d);
1683 }
1684
1685 Device::Device(const Device& d)
1686 {
1687     p = d.p;
1688     if(p)
1689         p->addref();
1690 }
1691
1692 Device& Device::operator = (const Device& d)
1693 {
1694     Impl* newp = (Impl*)d.p;
1695     if(newp)
1696         newp->addref();
1697     if(p)
1698         p->release();
1699     p = newp;
1700     return *this;
1701 }
1702
1703 Device::~Device()
1704 {
1705     if(p)
1706         p->release();
1707 }
1708
1709 void Device::set(void* d)
1710 {
1711     if(p)
1712         p->release();
1713     p = new Impl(d);
1714     if (p->handle)
1715     {
1716         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1717     }
1718 }
1719
1720 Device Device::fromHandle(void* d)
1721 {
1722     Device device(d);
1723     return device;
1724 }
1725
1726 void* Device::ptr() const
1727 {
1728     return p ? p->handle : 0;
1729 }
1730
1731 String Device::name() const
1732 { return p ? p->name_ : String(); }
1733
1734 String Device::extensions() const
1735 { return p ? String(p->extensions_) : String(); }
1736
1737 bool Device::isExtensionSupported(const String& extensionName) const
1738 { return p ? p->isExtensionSupported(extensionName) : false; }
1739
1740 String Device::version() const
1741 { return p ? p->version_ : String(); }
1742
1743 String Device::vendorName() const
1744 { return p ? p->vendorName_ : String(); }
1745
1746 int Device::vendorID() const
1747 { return p ? p->vendorID_ : 0; }
1748
1749 String Device::OpenCL_C_Version() const
1750 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1751
1752 String Device::OpenCLVersion() const
1753 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1754
1755 int Device::deviceVersionMajor() const
1756 { return p ? p->deviceVersionMajor_ : 0; }
1757
1758 int Device::deviceVersionMinor() const
1759 { return p ? p->deviceVersionMinor_ : 0; }
1760
1761 String Device::driverVersion() const
1762 { return p ? p->driverVersion_ : String(); }
1763
1764 int Device::type() const
1765 { return p ? p->type_ : 0; }
1766
1767 int Device::addressBits() const
1768 { return p ? p->addressBits_ : 0; }
1769
1770 bool Device::available() const
1771 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1772
1773 bool Device::compilerAvailable() const
1774 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1775
1776 bool Device::linkerAvailable() const
1777 #ifdef CL_VERSION_1_2
1778 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1779 #else
1780 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1781 #endif
1782
1783 int Device::doubleFPConfig() const
1784 { return p ? p->doubleFPConfig_ : 0; }
1785
1786 int Device::singleFPConfig() const
1787 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1788
1789 int Device::halfFPConfig() const
1790 #ifdef CL_VERSION_1_2
1791 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1792 #else
1793 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1794 #endif
1795
1796 bool Device::endianLittle() const
1797 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1798
1799 bool Device::errorCorrectionSupport() const
1800 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1801
1802 int Device::executionCapabilities() const
1803 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1804
1805 size_t Device::globalMemCacheSize() const
1806 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1807
1808 int Device::globalMemCacheType() const
1809 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1810
1811 int Device::globalMemCacheLineSize() const
1812 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1813
1814 size_t Device::globalMemSize() const
1815 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1816
1817 size_t Device::localMemSize() const
1818 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1819
1820 int Device::localMemType() const
1821 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1822
1823 bool Device::hostUnifiedMemory() const
1824 { return p ? p->hostUnifiedMemory_ : false; }
1825
1826 bool Device::imageSupport() const
1827 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1828
1829 bool Device::imageFromBufferSupport() const
1830 {
1831     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1832 }
1833
1834 uint Device::imagePitchAlignment() const
1835 {
1836 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1837     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1838 #else
1839     return 0;
1840 #endif
1841 }
1842
1843 uint Device::imageBaseAddressAlignment() const
1844 {
1845 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1846     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1847 #else
1848     return 0;
1849 #endif
1850 }
1851
1852 size_t Device::image2DMaxWidth() const
1853 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1854
1855 size_t Device::image2DMaxHeight() const
1856 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1857
1858 size_t Device::image3DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1860
1861 size_t Device::image3DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1863
1864 size_t Device::image3DMaxDepth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1866
1867 size_t Device::imageMaxBufferSize() const
1868 #ifdef CL_VERSION_1_2
1869 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1870 #else
1871 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1872 #endif
1873
1874 size_t Device::imageMaxArraySize() const
1875 #ifdef CL_VERSION_1_2
1876 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1877 #else
1878 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1879 #endif
1880
1881 bool Device::intelSubgroupsSupport() const
1882 { return p ? p->intelSubgroupsSupport_ : false; }
1883
1884 int Device::maxClockFrequency() const
1885 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1886
1887 int Device::maxComputeUnits() const
1888 { return p ? p->maxComputeUnits_ : 0; }
1889
1890 int Device::maxConstantArgs() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1892
1893 size_t Device::maxConstantBufferSize() const
1894 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1895
1896 size_t Device::maxMemAllocSize() const
1897 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1898
1899 size_t Device::maxParameterSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1901
1902 int Device::maxReadImageArgs() const
1903 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1904
1905 int Device::maxWriteImageArgs() const
1906 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1907
1908 int Device::maxSamplers() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1910
1911 size_t Device::maxWorkGroupSize() const
1912 { return p ? p->maxWorkGroupSize_ : 0; }
1913
1914 int Device::maxWorkItemDims() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1916
1917 void Device::maxWorkItemSizes(size_t* sizes) const
1918 {
1919     if(p)
1920     {
1921         const int MAX_DIMS = 32;
1922         size_t retsz = 0;
1923         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1924                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1925     }
1926 }
1927
1928 int Device::memBaseAddrAlign() const
1929 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1930
1931 int Device::nativeVectorWidthChar() const
1932 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1933
1934 int Device::nativeVectorWidthShort() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1936
1937 int Device::nativeVectorWidthInt() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1939
1940 int Device::nativeVectorWidthLong() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1942
1943 int Device::nativeVectorWidthFloat() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1945
1946 int Device::nativeVectorWidthDouble() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1948
1949 int Device::nativeVectorWidthHalf() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1951
1952 int Device::preferredVectorWidthChar() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1954
1955 int Device::preferredVectorWidthShort() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1957
1958 int Device::preferredVectorWidthInt() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1960
1961 int Device::preferredVectorWidthLong() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1963
1964 int Device::preferredVectorWidthFloat() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1966
1967 int Device::preferredVectorWidthDouble() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1969
1970 int Device::preferredVectorWidthHalf() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1972
1973 size_t Device::printfBufferSize() const
1974 #ifdef CL_VERSION_1_2
1975 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1976 #else
1977 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1978 #endif
1979
1980
1981 size_t Device::profilingTimerResolution() const
1982 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1983
1984 const Device& Device::getDefault()
1985 {
1986     auto& c = OpenCLExecutionContext::getCurrent();
1987     if (!c.empty())
1988     {
1989         return c.getDevice();
1990     }
1991
1992     static Device dummy;
1993     return dummy;
1994 }
1995
1996 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1997
1998 template <typename Functor, typename ObjectType>
1999 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2000 {
2001     ::size_t required;
2002     cl_int err = f(obj, name, 0, NULL, &required);
2003     if (err != CL_SUCCESS)
2004         return err;
2005
2006     param.clear();
2007     if (required > 0)
2008     {
2009         AutoBuffer<char> buf(required + 1);
2010         char* ptr = buf.data(); // cleanup is not needed
2011         err = f(obj, name, required, ptr, NULL);
2012         if (err != CL_SUCCESS)
2013             return err;
2014         param = ptr;
2015     }
2016
2017     return CL_SUCCESS;
2018 }
2019
2020 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2021 {
2022     elems.clear();
2023     if (s.size() == 0)
2024         return;
2025     std::istringstream ss(s);
2026     std::string item;
2027     while (!ss.eof())
2028     {
2029         std::getline(ss, item, delim);
2030         elems.push_back(item);
2031     }
2032 }
2033
2034 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2035 // Sample: AMD:GPU:
2036 // Sample: AMD:GPU:Tahiti
2037 // Sample: :GPU|CPU: = '' = ':' = '::'
2038 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2039         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2040 {
2041     std::vector<std::string> parts;
2042     split(configurationStr, ':', parts);
2043     if (parts.size() > 3)
2044     {
2045         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2046         return false;
2047     }
2048     if (parts.size() > 2)
2049         deviceNameOrID = parts[2];
2050     if (parts.size() > 1)
2051     {
2052         split(parts[1], '|', deviceTypes);
2053     }
2054     if (parts.size() > 0)
2055     {
2056         platform = parts[0];
2057     }
2058     return true;
2059 }
2060
2061 #if defined WINRT || defined _WIN32_WCE
2062 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2063 {
2064     CV_UNUSED(configuration)
2065     return NULL;
2066 }
2067 #else
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2069 {
2070     std::string platform, deviceName;
2071     std::vector<std::string> deviceTypes;
2072
2073     if (!configuration)
2074         configuration = getenv("OPENCV_OPENCL_DEVICE");
2075
2076     if (configuration &&
2077             (strcmp(configuration, "disabled") == 0 ||
2078              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2079             ))
2080         return NULL;
2081
2082     bool isID = false;
2083     int deviceID = -1;
2084     if (deviceName.length() == 1)
2085     // We limit ID range to 0..9, because we want to write:
2086     // - '2500' to mean i5-2500
2087     // - '8350' to mean AMD FX-8350
2088     // - '650' to mean GeForce 650
2089     // To extend ID range change condition to '> 0'
2090     {
2091         isID = true;
2092         for (size_t i = 0; i < deviceName.length(); i++)
2093         {
2094             if (!isdigit(deviceName[i]))
2095             {
2096                 isID = false;
2097                 break;
2098             }
2099         }
2100         if (isID)
2101         {
2102             deviceID = atoi(deviceName.c_str());
2103             if (deviceID < 0)
2104                 return NULL;
2105         }
2106     }
2107
2108     std::vector<cl_platform_id> platforms;
2109     {
2110         cl_uint numPlatforms = 0;
2111         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2112
2113         if (numPlatforms == 0)
2114             return NULL;
2115         platforms.resize((size_t)numPlatforms);
2116         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2117         platforms.resize(numPlatforms);
2118     }
2119
2120     int selectedPlatform = -1;
2121     if (platform.length() > 0)
2122     {
2123         for (size_t i = 0; i < platforms.size(); i++)
2124         {
2125             std::string name;
2126             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2127             if (name.find(platform) != std::string::npos)
2128             {
2129                 selectedPlatform = (int)i;
2130                 break;
2131             }
2132         }
2133         if (selectedPlatform == -1)
2134         {
2135             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2136             goto not_found;
2137         }
2138     }
2139     if (deviceTypes.size() == 0)
2140     {
2141         if (!isID)
2142         {
2143             deviceTypes.push_back("GPU");
2144             if (configuration)
2145                 deviceTypes.push_back("CPU");
2146         }
2147         else
2148             deviceTypes.push_back("ALL");
2149     }
2150     for (size_t t = 0; t < deviceTypes.size(); t++)
2151     {
2152         int deviceType = 0;
2153         std::string tempStrDeviceType = deviceTypes[t];
2154         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2155
2156         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2157             deviceType = Device::TYPE_GPU;
2158         else if (tempStrDeviceType == "cpu")
2159             deviceType = Device::TYPE_CPU;
2160         else if (tempStrDeviceType == "accelerator")
2161             deviceType = Device::TYPE_ACCELERATOR;
2162         else if (tempStrDeviceType == "all")
2163             deviceType = Device::TYPE_ALL;
2164         else
2165         {
2166             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2167             goto not_found;
2168         }
2169
2170         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2171         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2172                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2173                 i++)
2174         {
2175             cl_uint count = 0;
2176             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2177             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2178             {
2179                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2180             }
2181             if (count == 0)
2182                 continue;
2183             size_t base = devices.size();
2184             devices.resize(base + count);
2185             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2186             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2187             {
2188                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2189             }
2190         }
2191
2192         for (size_t i = (isID ? deviceID : 0);
2193              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2194              i++)
2195         {
2196             std::string name;
2197             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2198             cl_bool useGPU = true;
2199             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2200             {
2201                 cl_bool isIGPU = CL_FALSE;
2202                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2203                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2204             }
2205             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2206             {
2207                 // TODO check for OpenCL 1.1
2208                 return devices[i];
2209             }
2210         }
2211     }
2212
2213 not_found:
2214     if (!configuration)
2215         return NULL; // suppress messages on stderr
2216
2217     std::ostringstream msg;
2218     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2219         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2220         << "    Device types:";
2221     for (size_t t = 0; t < deviceTypes.size(); t++)
2222         msg << ' ' << deviceTypes[t];
2223
2224     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2225
2226     CV_LOG_ERROR(NULL, msg.str());
2227     return NULL;
2228 }
2229 #endif
2230
2231 #ifdef HAVE_OPENCL_SVM
2232 namespace svm {
2233
2234 enum AllocatorFlags { // don't use first 16 bits
2235         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2236         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2237         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2238         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2239         OPENCL_SVM_BUFFER_MAP = 4 << 16
2240 };
2241
2242 static bool checkForceSVMUmatUsage()
2243 {
2244     static bool initialized = false;
2245     static bool force = false;
2246     if (!initialized)
2247     {
2248         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2249         initialized = true;
2250     }
2251     return force;
2252 }
2253 static bool checkDisableSVMUMatUsage()
2254 {
2255     static bool initialized = false;
2256     static bool force = false;
2257     if (!initialized)
2258     {
2259         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2260         initialized = true;
2261     }
2262     return force;
2263 }
2264 static bool checkDisableSVM()
2265 {
2266     static bool initialized = false;
2267     static bool force = false;
2268     if (!initialized)
2269     {
2270         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2271         initialized = true;
2272     }
2273     return force;
2274 }
2275 // see SVMCapabilities
2276 static unsigned int getSVMCapabilitiesMask()
2277 {
2278     static bool initialized = false;
2279     static unsigned int mask = 0;
2280     if (!initialized)
2281     {
2282         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2283         if (envValue == NULL)
2284         {
2285             return ~0U; // all bits 1
2286         }
2287         mask = atoi(envValue);
2288         initialized = true;
2289     }
2290     return mask;
2291 }
2292 } // namespace
2293 #endif
2294
2295 static size_t getProgramCountLimit()
2296 {
2297     static bool initialized = false;
2298     static size_t count = 0;
2299     if (!initialized)
2300     {
2301         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2302         initialized = true;
2303     }
2304     return count;
2305 }
2306
2307 static int g_contextId = 0;
2308
2309 class OpenCLBufferPoolImpl;
2310 class OpenCLSVMBufferPoolImpl;
2311
2312 struct Context::Impl
2313 {
2314     static Context::Impl* get(Context& context) { return context.p; }
2315
2316     typedef std::deque<Context::Impl*> container_t;
2317     static container_t& getGlobalContainer()
2318     {
2319         // never delete this container (Impl lifetime is greater due to TLS storage)
2320         static container_t* g_contexts = new container_t();
2321         return *g_contexts;
2322     }
2323
2324 protected:
2325     Impl(const std::string& configuration_)
2326         : refcount(1)
2327         , contextId(CV_XADD(&g_contextId, 1))
2328         , configuration(configuration_)
2329         , handle(0)
2330 #ifdef HAVE_OPENCL_SVM
2331         , svmInitialized(false)
2332 #endif
2333     {
2334         if (!haveOpenCL())
2335             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2336
2337         cv::AutoLock lock(cv::getInitializationMutex());
2338         auto& container = getGlobalContainer();
2339         container.resize(std::max(container.size(), (size_t)contextId + 1));
2340         container[contextId] = this;
2341     }
2342
2343     ~Impl()
2344     {
2345 #ifdef _WIN32
2346         if (!cv::__termination)
2347 #endif
2348         {
2349             if (handle)
2350             {
2351                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2352                 handle = NULL;
2353             }
2354             devices.clear();
2355         }
2356
2357         {
2358             cv::AutoLock lock(cv::getInitializationMutex());
2359             auto& container = getGlobalContainer();
2360             CV_CheckLT((size_t)contextId, container.size(), "");
2361             container[contextId] = NULL;
2362         }
2363     }
2364
2365     void init_device_list()
2366     {
2367         CV_Assert(handle);
2368
2369         cl_uint ndevices = 0;
2370         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2371         CV_Assert(ndevices > 0);
2372
2373         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2374         size_t devices_ret_size = 0;
2375         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2376         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2377
2378         devices.clear();
2379         for (unsigned i = 0; i < ndevices; i++)
2380         {
2381             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2382         }
2383     }
2384
2385     void __init_buffer_pools();  // w/o synchronization
2386     void _init_buffer_pools() const
2387     {
2388         if (!bufferPool_)
2389         {
2390             cv::AutoLock lock(cv::getInitializationMutex());
2391             if (!bufferPool_)
2392             {
2393                 const_cast<Impl*>(this)->__init_buffer_pools();
2394             }
2395         }
2396     }
2397 public:
2398     static Impl* findContext(const std::string& configuration)
2399     {
2400         CV_TRACE_FUNCTION();
2401         cv::AutoLock lock(cv::getInitializationMutex());
2402         auto& container = getGlobalContainer();
2403         if (configuration.empty() && !container.empty())
2404             return container[0];
2405         for (auto it = container.begin(); it != container.end(); ++it)
2406         {
2407             Impl* i = *it;
2408             if (i && i->configuration == configuration)
2409             {
2410                 return i;
2411             }
2412         }
2413         return NULL;
2414     }
2415
2416     static Impl* findOrCreateContext(const std::string& configuration_)
2417     {
2418         CV_TRACE_FUNCTION();
2419         std::string configuration = configuration_;
2420         if (configuration_.empty())
2421         {
2422             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2423             if (c)
2424                 configuration = c;
2425         }
2426         Impl* impl = findContext(configuration);
2427         if (impl)
2428         {
2429             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2430             return impl;
2431         }
2432
2433         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2434         if (d == NULL)
2435             return NULL;
2436
2437         impl = new Impl(configuration);
2438         try
2439         {
2440             impl->createFromDevice(d);
2441             if (impl->handle)
2442                 return impl;
2443             delete impl;
2444             return NULL;
2445         }
2446         catch (...)
2447         {
2448             delete impl;
2449             throw;
2450         }
2451     }
2452
2453     static Impl* findOrCreateContext(cl_context h)
2454     {
2455         CV_TRACE_FUNCTION();
2456
2457         CV_Assert(h);
2458
2459         std::string configuration = cv::format("@ctx-%p", (void*)h);
2460         Impl* impl = findContext(configuration);
2461         if (impl)
2462         {
2463             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2464             impl->addref();
2465             return impl;
2466         }
2467
2468         impl = new Impl(configuration);
2469         try
2470         {
2471             CV_OCL_CHECK(clRetainContext(h));
2472             impl->handle = h;
2473             impl->init_device_list();
2474             return impl;
2475         }
2476         catch (...)
2477         {
2478             delete impl;
2479             throw;
2480         }
2481     }
2482
2483     static Impl* findOrCreateContext(const ocl::Device& device)
2484     {
2485         CV_TRACE_FUNCTION();
2486
2487         CV_Assert(!device.empty());
2488         cl_device_id d = (cl_device_id)device.ptr();
2489         CV_Assert(d);
2490
2491         std::string configuration = cv::format("@dev-%p", (void*)d);
2492         Impl* impl = findContext(configuration);
2493         if (impl)
2494         {
2495             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2496             impl->addref();
2497             return impl;
2498         }
2499
2500         impl = new Impl(configuration);
2501         try
2502         {
2503             impl->createFromDevice(d);
2504             CV_Assert(impl->handle);
2505             return impl;
2506         }
2507         catch (...)
2508         {
2509             delete impl;
2510             throw;
2511         }
2512     }
2513
2514     void setDefault()
2515     {
2516         CV_TRACE_FUNCTION();
2517         cl_device_id d = selectOpenCLDevice();
2518
2519         if (d == NULL)
2520             return;
2521
2522         createFromDevice(d);
2523     }
2524
2525     void createFromDevice(cl_device_id d)
2526     {
2527         CV_TRACE_FUNCTION();
2528         CV_Assert(handle == NULL);
2529
2530         cl_platform_id pl = NULL;
2531         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2532
2533         cl_context_properties prop[] =
2534         {
2535             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2536             0
2537         };
2538
2539         // !!! in the current implementation force the number of devices to 1 !!!
2540         cl_uint nd = 1;
2541         cl_int status;
2542
2543         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2544         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2545
2546         bool ok = handle != 0 && status == CL_SUCCESS;
2547         if( ok )
2548         {
2549             devices.resize(nd);
2550             devices[0].set(d);
2551         }
2552         else
2553             handle = NULL;
2554     }
2555
2556     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2557
2558     void unloadProg(Program& prog)
2559     {
2560         cv::AutoLock lock(program_cache_mutex);
2561         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2562         {
2563               phash_t::iterator it = phash.find(*i);
2564               if (it != phash.end())
2565               {
2566                   if (it->second.ptr() == prog.ptr())
2567                   {
2568                       phash.erase(*i);
2569                       cacheList.erase(i);
2570                       return;
2571                   }
2572               }
2573         }
2574     }
2575
2576     std::string& getPrefixString()
2577     {
2578         if (prefix.empty())
2579         {
2580             cv::AutoLock lock(program_cache_mutex);
2581             if (prefix.empty())
2582             {
2583                 CV_Assert(!devices.empty());
2584                 const Device& d = devices[0];
2585                 int bits = d.addressBits();
2586                 if (bits > 0 && bits != 64)
2587                     prefix = cv::format("%d-bit--", bits);
2588                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2589                 // sanitize chars
2590                 for (size_t i = 0; i < prefix.size(); i++)
2591                 {
2592                     char c = prefix[i];
2593                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2594                     {
2595                         prefix[i] = '_';
2596                     }
2597                 }
2598             }
2599         }
2600         return prefix;
2601     }
2602
2603     std::string& getPrefixBase()
2604     {
2605         if (prefix_base.empty())
2606         {
2607             cv::AutoLock lock(program_cache_mutex);
2608             if (prefix_base.empty())
2609             {
2610                 const Device& d = devices[0];
2611                 int bits = d.addressBits();
2612                 if (bits > 0 && bits != 64)
2613                     prefix_base = cv::format("%d-bit--", bits);
2614                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2615                 // sanitize chars
2616                 for (size_t i = 0; i < prefix_base.size(); i++)
2617                 {
2618                     char c = prefix_base[i];
2619                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2620                     {
2621                         prefix_base[i] = '_';
2622                     }
2623                 }
2624             }
2625         }
2626         return prefix_base;
2627     }
2628
2629     IMPLEMENT_REFCOUNTABLE();
2630
2631     const int contextId;  // global unique ID
2632     const std::string configuration;
2633
2634     cl_context handle;
2635     std::vector<Device> devices;
2636
2637     std::string prefix;
2638     std::string prefix_base;
2639
2640     cv::Mutex program_cache_mutex;
2641     typedef std::map<std::string, Program> phash_t;
2642     phash_t phash;
2643     typedef std::list<cv::String> CacheList;
2644     CacheList cacheList;
2645
2646     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2647     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2648     OpenCLBufferPoolImpl& getBufferPool() const
2649     {
2650         _init_buffer_pools();
2651         CV_DbgAssert(bufferPool_);
2652         return *bufferPool_.get();
2653     }
2654     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2655     {
2656         _init_buffer_pools();
2657         CV_DbgAssert(bufferPoolHostPtr_);
2658         return *bufferPoolHostPtr_.get();
2659     }
2660
2661 #ifdef HAVE_OPENCL_SVM
2662     bool svmInitialized;
2663     bool svmAvailable;
2664     bool svmEnabled;
2665     svm::SVMCapabilities svmCapabilities;
2666     svm::SVMFunctions svmFunctions;
2667
2668     void svmInit()
2669     {
2670         CV_Assert(handle != NULL);
2671         const Device& device = devices[0];
2672         cl_device_svm_capabilities deviceCaps = 0;
2673         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2674         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2675         if (status != CL_SUCCESS)
2676         {
2677             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2678             goto noSVM;
2679         }
2680         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2681         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2682         svmCapabilities.value_ =
2683                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2684                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2685                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2686                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2687         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2688         if (svmCapabilities.value_ == 0)
2689         {
2690             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2691             goto noSVM;
2692         }
2693         try
2694         {
2695             // Try OpenCL 2.0
2696             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2697             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2698             if (!ptr)
2699             {
2700                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2701                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2702             }
2703             try
2704             {
2705                 bool error = false;
2706                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2707                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2708                 {
2709                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2710                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2711                 }
2712                 clFinish(q);
2713                 try
2714                 {
2715                     ((int*)ptr)[0] = 100;
2716                 }
2717                 catch (...)
2718                 {
2719                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2720                     error = true;
2721                 }
2722                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2723                 {
2724                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2725                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2726                 }
2727                 clFinish(q);
2728                 if (error)
2729                 {
2730                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2731                 }
2732             }
2733             catch (...)
2734             {
2735                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2736                 clSVMFree(handle, ptr);
2737                 throw;
2738             }
2739             clSVMFree(handle, ptr);
2740             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2741             svmFunctions.fn_clSVMFree = clSVMFree;
2742             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2743             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2744             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2745             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2746             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2747             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2748             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2749         }
2750         catch (...)
2751         {
2752             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2753             try
2754             {
2755                 // Try HSA extension
2756                 String extensions = device.extensions();
2757                 if (extensions.find("cl_amd_svm") == String::npos)
2758                 {
2759                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2760                     goto noSVM;
2761                 }
2762                 cl_platform_id p = NULL;
2763                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2764                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2765                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2766                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2767                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2768                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2769                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2770                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2771                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2772                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2773                 CV_Assert(svmFunctions.isValid());
2774             }
2775             catch (...)
2776             {
2777                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2778                 goto noSVM;
2779             }
2780         }
2781
2782         svmAvailable = true;
2783         svmEnabled = !svm::checkDisableSVM();
2784         svmInitialized = true;
2785         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2786         return;
2787     noSVM:
2788         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2789         svmAvailable = false;
2790         svmEnabled = false;
2791         svmCapabilities.value_ = 0;
2792         svmInitialized = true;
2793         svmFunctions.fn_clSVMAlloc = NULL;
2794         return;
2795     }
2796
2797     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2798
2799     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2800     {
2801         _init_buffer_pools();
2802         CV_DbgAssert(bufferPoolSVM_);
2803         return *bufferPoolSVM_.get();
2804     }
2805 #endif
2806
2807     friend class Program;
2808 };
2809
2810
2811 Context::Context()
2812 {
2813     p = 0;
2814 }
2815
2816 Context::~Context()
2817 {
2818     release();
2819 }
2820
2821 // deprecated
2822 Context::Context(int dtype)
2823 {
2824     p = 0;
2825     create(dtype);
2826 }
2827
2828 void Context::release()
2829 {
2830     if (p)
2831     {
2832         p->release();
2833         p = NULL;
2834     }
2835 }
2836
2837 bool Context::create()
2838 {
2839     release();
2840     if (!haveOpenCL())
2841         return false;
2842     p = Impl::findOrCreateContext(std::string());
2843     if (p && p->handle)
2844         return true;
2845     release();
2846     return false;
2847 }
2848
2849 // deprecated
2850 bool Context::create(int dtype)
2851 {
2852     if( !haveOpenCL() )
2853         return false;
2854     release();
2855     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2856     {
2857         p = Impl::findOrCreateContext("");
2858     }
2859     else if (dtype == CL_DEVICE_TYPE_GPU)
2860     {
2861         p = Impl::findOrCreateContext(":GPU:");
2862     }
2863     else if (dtype == CL_DEVICE_TYPE_CPU)
2864     {
2865         p = Impl::findOrCreateContext(":CPU:");
2866     }
2867     else
2868     {
2869         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2870     }
2871     if (p && !p->handle)
2872     {
2873         release();
2874     }
2875     return p != 0;
2876 }
2877
2878 Context::Context(const Context& c)
2879 {
2880     p = (Impl*)c.p;
2881     if(p)
2882         p->addref();
2883 }
2884
2885 Context& Context::operator = (const Context& c)
2886 {
2887     Impl* newp = (Impl*)c.p;
2888     if(newp)
2889         newp->addref();
2890     if(p)
2891         p->release();
2892     p = newp;
2893     return *this;
2894 }
2895
2896 void* Context::ptr() const
2897 {
2898     return p == NULL ? NULL : p->handle;
2899 }
2900
2901 size_t Context::ndevices() const
2902 {
2903     return p ? p->devices.size() : 0;
2904 }
2905
2906 Device& Context::device(size_t idx) const
2907 {
2908     static Device dummy;
2909     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2910 }
2911
2912 Context& Context::getDefault(bool initialize)
2913 {
2914     auto& c = OpenCLExecutionContext::getCurrent();
2915     if (!c.empty())
2916     {
2917         auto& ctx = c.getContext();
2918         return ctx;
2919     }
2920
2921     CV_UNUSED(initialize);
2922     static Context dummy;
2923     return dummy;
2924 }
2925
2926 Program Context::getProg(const ProgramSource& prog,
2927                          const String& buildopts, String& errmsg)
2928 {
2929     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2930 }
2931
2932 void Context::unloadProg(Program& prog)
2933 {
2934     if (p)
2935         p->unloadProg(prog);
2936 }
2937
2938 /* static */
2939 Context Context::fromHandle(void* context)
2940 {
2941     Context ctx;
2942     ctx.p = Impl::findOrCreateContext((cl_context)context);
2943     return ctx;
2944 }
2945
2946 /* static */
2947 Context Context::fromDevice(const ocl::Device& device)
2948 {
2949     Context ctx;
2950     ctx.p = Impl::findOrCreateContext(device);
2951     return ctx;
2952 }
2953
2954 /* static */
2955 Context Context::create(const std::string& configuration)
2956 {
2957     Context ctx;
2958     ctx.p = Impl::findOrCreateContext(configuration);
2959     return ctx;
2960 }
2961
2962 #ifdef HAVE_OPENCL_SVM
2963 bool Context::useSVM() const
2964 {
2965     Context::Impl* i = p;
2966     CV_Assert(i);
2967     if (!i->svmInitialized)
2968         i->svmInit();
2969     return i->svmEnabled;
2970 }
2971 void Context::setUseSVM(bool enabled)
2972 {
2973     Context::Impl* i = p;
2974     CV_Assert(i);
2975     if (!i->svmInitialized)
2976         i->svmInit();
2977     if (enabled && !i->svmAvailable)
2978     {
2979         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2980     }
2981     i->svmEnabled = enabled;
2982 }
2983 #else
2984 bool Context::useSVM() const { return false; }
2985 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2986 #endif
2987
2988 #ifdef HAVE_OPENCL_SVM
2989 namespace svm {
2990
2991 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2992 {
2993     Context::Impl* i = context.p;
2994     CV_Assert(i);
2995     if (!i->svmInitialized)
2996         i->svmInit();
2997     return i->svmCapabilities;
2998 }
2999
3000 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3001 {
3002     Context::Impl* i = context.p;
3003     CV_Assert(i);
3004     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3005     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3006     return &i->svmFunctions;
3007 }
3008
3009 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3010 {
3011     if (checkForceSVMUmatUsage())
3012         return true;
3013     if (checkDisableSVMUMatUsage())
3014         return false;
3015     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3016         return true;
3017     return false; // don't use SVM by default
3018 }
3019
3020 } // namespace cv::ocl::svm
3021 #endif // HAVE_OPENCL_SVM
3022
3023
3024 static void get_platform_name(cl_platform_id id, String& name)
3025 {
3026     // get platform name string length
3027     size_t sz = 0;
3028     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3029
3030     // get platform name string
3031     AutoBuffer<char> buf(sz + 1);
3032     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3033
3034     // just in case, ensure trailing zero for ASCIIZ string
3035     buf[sz] = 0;
3036
3037     name = buf.data();
3038 }
3039
3040 /*
3041 // Attaches OpenCL context to OpenCV
3042 */
3043 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3044 {
3045     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3046     ctx.bind();
3047 }
3048
3049 /* static */
3050 OpenCLExecutionContext OpenCLExecutionContext::create(
3051         const std::string& platformName, void* platformID, void* context, void* deviceID
3052 )
3053 {
3054     if (!haveOpenCL())
3055         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3056
3057     cl_uint cnt = 0;
3058     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3059
3060     if (cnt == 0)
3061         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3062
3063     std::vector<cl_platform_id> platforms(cnt);
3064
3065     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3066
3067     bool platformAvailable = false;
3068
3069     // check if external platformName contained in list of available platforms in OpenCV
3070     for (unsigned int i = 0; i < cnt; i++)
3071     {
3072         String availablePlatformName;
3073         get_platform_name(platforms[i], availablePlatformName);
3074         // external platform is found in the list of available platforms
3075         if (platformName == availablePlatformName)
3076         {
3077             platformAvailable = true;
3078             break;
3079         }
3080     }
3081
3082     if (!platformAvailable)
3083         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3084
3085     // check if platformID corresponds to platformName
3086     String actualPlatformName;
3087     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3088     if (platformName != actualPlatformName)
3089         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3090
3091     OpenCLExecutionContext ctx;
3092     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3093     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3094     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3095     return ctx;
3096 }
3097
3098 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3099 {
3100     // internal call, less checks
3101     cl_platform_id platformID = (cl_platform_id)_platform;
3102     cl_context context = (cl_context)_context;
3103     cl_device_id deviceID = (cl_device_id)_device;
3104
3105     std::string platformName = PlatformInfo(&platformID).name();
3106
3107     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3108     CV_Assert(!clExecCtx.empty());
3109     ctx = clExecCtx.getContext();
3110 }
3111
3112 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3113
3114 struct Queue::Impl
3115 {
3116     inline void __init()
3117     {
3118         refcount = 1;
3119         handle = 0;
3120         isProfilingQueue_ = false;
3121     }
3122
3123     Impl(cl_command_queue q)
3124     {
3125         __init();
3126         handle = q;
3127
3128         cl_command_queue_properties props = 0;
3129         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3130         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3131     }
3132
3133     Impl(cl_command_queue q, bool isProfilingQueue)
3134     {
3135         __init();
3136         handle = q;
3137         isProfilingQueue_ = isProfilingQueue;
3138     }
3139
3140     Impl(const Context& c, const Device& d, bool withProfiling = false)
3141     {
3142         __init();
3143
3144         const Context* pc = &c;
3145         cl_context ch = (cl_context)pc->ptr();
3146         if( !ch )
3147         {
3148             pc = &Context::getDefault();
3149             ch = (cl_context)pc->ptr();
3150         }
3151         cl_device_id dh = (cl_device_id)d.ptr();
3152         if( !dh )
3153             dh = (cl_device_id)pc->device(0).ptr();
3154         cl_int retval = 0;
3155         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3156         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3157         isProfilingQueue_ = withProfiling;
3158     }
3159
3160     ~Impl()
3161     {
3162 #ifdef _WIN32
3163         if (!cv::__termination)
3164 #endif
3165         {
3166             if(handle)
3167             {
3168                 CV_OCL_DBG_CHECK(clFinish(handle));
3169                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3170                 handle = NULL;
3171             }
3172         }
3173     }
3174
3175     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3176     {
3177         if (isProfilingQueue_)
3178             return self;
3179
3180         if (profiling_queue_.ptr())
3181             return profiling_queue_;
3182
3183         cl_context ctx = 0;
3184         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3185
3186         cl_device_id device = 0;
3187         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3188
3189         cl_int result = CL_SUCCESS;
3190         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3191         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3192         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3193
3194         Queue queue;
3195         queue.p = new Impl(q, true);
3196         profiling_queue_ = queue;
3197
3198         return profiling_queue_;
3199     }
3200
3201     IMPLEMENT_REFCOUNTABLE();
3202
3203     cl_command_queue handle;
3204     bool isProfilingQueue_;
3205     cv::ocl::Queue profiling_queue_;
3206 };
3207
3208 Queue::Queue()
3209 {
3210     p = 0;
3211 }
3212
3213 Queue::Queue(const Context& c, const Device& d)
3214 {
3215     p = 0;
3216     create(c, d);
3217 }
3218
3219 Queue::Queue(const Queue& q)
3220 {
3221     p = q.p;
3222     if(p)
3223         p->addref();
3224 }
3225
3226 Queue& Queue::operator = (const Queue& q)
3227 {
3228     Impl* newp = (Impl*)q.p;
3229     if(newp)
3230         newp->addref();
3231     if(p)
3232         p->release();
3233     p = newp;
3234     return *this;
3235 }
3236
3237 Queue::~Queue()
3238 {
3239     if(p)
3240         p->release();
3241 }
3242
3243 bool Queue::create(const Context& c, const Device& d)
3244 {
3245     if(p)
3246         p->release();
3247     p = new Impl(c, d);
3248     return p->handle != 0;
3249 }
3250
3251 void Queue::finish()
3252 {
3253     if(p && p->handle)
3254     {
3255         CV_OCL_DBG_CHECK(clFinish(p->handle));
3256     }
3257 }
3258
3259 const Queue& Queue::getProfilingQueue() const
3260 {
3261     CV_Assert(p);
3262     return p->getProfilingQueue(*this);
3263 }
3264
3265 void* Queue::ptr() const
3266 {
3267     return p ? p->handle : 0;
3268 }
3269
3270 Queue& Queue::getDefault()
3271 {
3272     auto& c = OpenCLExecutionContext::getCurrent();
3273     if (!c.empty())
3274     {
3275         auto& q = c.getQueue();
3276         return q;
3277     }
3278     static Queue dummy;
3279     return dummy;
3280 }
3281
3282 static cl_command_queue getQueue(const Queue& q)
3283 {
3284     cl_command_queue qq = (cl_command_queue)q.ptr();
3285     if(!qq)
3286         qq = (cl_command_queue)Queue::getDefault().ptr();
3287     return qq;
3288 }
3289
3290 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3291
3292 KernelArg::KernelArg()
3293     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3294 {
3295 }
3296
3297 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3298     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3299 {
3300     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3301 }
3302
3303 KernelArg KernelArg::Constant(const Mat& m)
3304 {
3305     CV_Assert(m.isContinuous());
3306     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3307 }
3308
3309 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3310
3311 struct Kernel::Impl
3312 {
3313     Impl(const char* kname, const Program& prog) :
3314         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3315     {
3316         cl_program ph = (cl_program)prog.ptr();
3317         cl_int retval = 0;
3318         name = kname;
3319         if (ph)
3320         {
3321             handle = clCreateKernel(ph, kname, &retval);
3322             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3323         }
3324         for( int i = 0; i < MAX_ARRS; i++ )
3325             u[i] = 0;
3326         haveTempDstUMats = false;
3327         haveTempSrcUMats = false;
3328     }
3329
3330     void cleanupUMats()
3331     {
3332         for( int i = 0; i < MAX_ARRS; i++ )
3333             if( u[i] )
3334             {
3335                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3336                 {
3337                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3338                     u[i]->currAllocator->deallocate(u[i]);
3339                 }
3340                 u[i] = 0;
3341             }
3342         nu = 0;
3343         haveTempDstUMats = false;
3344         haveTempSrcUMats = false;
3345     }
3346
3347     void addUMat(const UMat& m, bool dst)
3348     {
3349         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3350         u[nu] = m.u;
3351         CV_XADD(&m.u->urefcount, 1);
3352         nu++;
3353         if(dst && m.u->tempUMat())
3354             haveTempDstUMats = true;
3355         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3356             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3357     }
3358
3359     void addImage(const Image2D& image)
3360     {
3361         images.push_back(image);
3362     }
3363
3364     void finit(cl_event e)
3365     {
3366         CV_UNUSED(e);
3367         cleanupUMats();
3368         images.clear();
3369         isInProgress = false;
3370         release();
3371     }
3372
3373     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3374             bool sync, int64* timeNS, const Queue& q);
3375
3376     ~Impl()
3377     {
3378         if(handle)
3379         {
3380             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3381         }
3382     }
3383
3384     IMPLEMENT_REFCOUNTABLE();
3385
3386     cv::String name;
3387     cl_kernel handle;
3388     enum { MAX_ARRS = 16 };
3389     UMatData* u[MAX_ARRS];
3390     bool isInProgress;
3391     bool isAsyncRun;  // true if kernel was scheduled in async mode
3392     int nu;
3393     std::list<Image2D> images;
3394     bool haveTempDstUMats;
3395     bool haveTempSrcUMats;
3396 };
3397
3398 }} // namespace cv::ocl
3399
3400 extern "C" {
3401
3402 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3403 {
3404     try
3405     {
3406         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3407     }
3408     catch (const cv::Exception& exc)
3409     {
3410         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3411     }
3412     catch (const std::exception& exc)
3413     {
3414         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3415     }
3416     catch (...)
3417     {
3418         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3419     }
3420 }
3421
3422 }
3423
3424 namespace cv { namespace ocl {
3425
3426 Kernel::Kernel()
3427 {
3428     p = 0;
3429 }
3430
3431 Kernel::Kernel(const char* kname, const Program& prog)
3432 {
3433     p = 0;
3434     create(kname, prog);
3435 }
3436
3437 Kernel::Kernel(const char* kname, const ProgramSource& src,
3438                const String& buildopts, String* errmsg)
3439 {
3440     p = 0;
3441     create(kname, src, buildopts, errmsg);
3442 }
3443
3444 Kernel::Kernel(const Kernel& k)
3445 {
3446     p = k.p;
3447     if(p)
3448         p->addref();
3449 }
3450
3451 Kernel& Kernel::operator = (const Kernel& k)
3452 {
3453     Impl* newp = (Impl*)k.p;
3454     if(newp)
3455         newp->addref();
3456     if(p)
3457         p->release();
3458     p = newp;
3459     return *this;
3460 }
3461
3462 Kernel::~Kernel()
3463 {
3464     if(p)
3465         p->release();
3466 }
3467
3468 bool Kernel::create(const char* kname, const Program& prog)
3469 {
3470     if(p)
3471         p->release();
3472     p = new Impl(kname, prog);
3473     if(p->handle == 0)
3474     {
3475         p->release();
3476         p = 0;
3477     }
3478 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3479     CV_Assert(p);
3480 #endif
3481     return p != 0;
3482 }
3483
3484 bool Kernel::create(const char* kname, const ProgramSource& src,
3485                     const String& buildopts, String* errmsg)
3486 {
3487     if(p)
3488     {
3489         p->release();
3490         p = 0;
3491     }
3492     String tempmsg;
3493     if( !errmsg ) errmsg = &tempmsg;
3494     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3495     return create(kname, prog);
3496 }
3497
3498 void* Kernel::ptr() const
3499 {
3500     return p ? p->handle : 0;
3501 }
3502
3503 bool Kernel::empty() const
3504 {
3505     return ptr() == 0;
3506 }
3507
3508 int Kernel::set(int i, const void* value, size_t sz)
3509 {
3510     if (!p || !p->handle)
3511         return -1;
3512     if (i < 0)
3513         return i;
3514     if( i == 0 )
3515         p->cleanupUMats();
3516
3517     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3518     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
3519     if (retval != CL_SUCCESS)
3520         return -1;
3521     return i+1;
3522 }
3523
3524 int Kernel::set(int i, const Image2D& image2D)
3525 {
3526     p->addImage(image2D);
3527     cl_mem h = (cl_mem)image2D.ptr();
3528     return set(i, &h, sizeof(h));
3529 }
3530
3531 int Kernel::set(int i, const UMat& m)
3532 {
3533     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3534 }
3535
3536 int Kernel::set(int i, const KernelArg& arg)
3537 {
3538     if( !p || !p->handle )
3539         return -1;
3540     if (i < 0)
3541     {
3542         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3543                 p->name.c_str(), (int)i));
3544         return i;
3545     }
3546     if( i == 0 )
3547         p->cleanupUMats();
3548     cl_int status = 0;
3549     if( arg.m )
3550     {
3551         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3552                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3553         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3554         if (ptronly && arg.m->empty())
3555         {
3556             cl_mem h_null = (cl_mem)NULL;
3557             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3558             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3559             return i + 1;
3560         }
3561         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3562
3563         if (!h)
3564         {
3565             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3566                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3567             p->release();
3568             p = 0;
3569             return -1;
3570         }
3571
3572 #ifdef HAVE_OPENCL_SVM
3573         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3574         {
3575             const Context& ctx = Context::getDefault();
3576             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3577             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3578             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3579 #if 1 // TODO
3580             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3581 #else
3582             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3583 #endif
3584             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3585         }
3586         else
3587 #endif
3588         {
3589             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3590             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3591         }
3592
3593         if (ptronly)
3594         {
3595             i++;
3596         }
3597         else if( arg.m->dims <= 2 )
3598         {
3599             UMat2D u2d(*arg.m);
3600             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3601             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3602             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3603             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3604             i += 3;
3605
3606             if( !(arg.flags & KernelArg::NO_SIZE) )
3607             {
3608                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3609                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3610                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3611                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3612                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3613                 i += 2;
3614             }
3615         }
3616         else
3617         {
3618             UMat3D u3d(*arg.m);
3619             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3620             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3621             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3622             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3623             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3624             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3625             i += 4;
3626             if( !(arg.flags & KernelArg::NO_SIZE) )
3627             {
3628                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3629                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3630                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3631                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3632                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3633                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3634                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3635                 i += 3;
3636             }
3637         }
3638         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3639         return i;
3640     }
3641     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3642     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3643     return i+1;
3644 }
3645
3646 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3647                  bool sync, const Queue& q)
3648 {
3649     if (!p)
3650         return false;
3651
3652     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3653     size_t total = 1;
3654     CV_Assert(_globalsize != NULL);
3655     for (int i = 0; i < dims; i++)
3656     {
3657         size_t val = _localsize ? _localsize[i] :
3658             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3659         CV_Assert( val > 0 );
3660         total *= _globalsize[i];
3661         if (_globalsize[i] == 1 && !_localsize)
3662             val = 1;
3663         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3664     }
3665     CV_Assert(total > 0);
3666
3667     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3668 }
3669
3670
3671 static bool isRaiseErrorOnReuseAsyncKernel()
3672 {
3673     static bool initialized = false;
3674     static bool value = false;
3675     if (!initialized)
3676     {
3677         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3678         initialized = true;
3679     }
3680     return value;
3681 }
3682
3683 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3684         bool sync, int64* timeNS, const Queue& q)
3685 {
3686     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3687
3688     if (!handle)
3689     {
3690         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3691         return false;
3692     }
3693
3694     if (isAsyncRun)
3695     {
3696         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3697         if (isRaiseErrorOnReuseAsyncKernel())
3698             CV_Assert(0);
3699         return false;  // OpenCV 5.0: raise error
3700     }
3701     isAsyncRun = !sync;
3702
3703     if (isInProgress)
3704     {
3705         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3706         if (isRaiseErrorOnReuseAsyncKernel())
3707             CV_Assert(0);
3708         return false;  // OpenCV 5.0: raise error
3709     }
3710
3711     cl_command_queue qq = getQueue(q);
3712     if (haveTempDstUMats)
3713         sync = true;
3714     if (haveTempSrcUMats)
3715         sync = true;
3716     if (timeNS)
3717         sync = true;
3718     cl_event asyncEvent = 0;
3719     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3720                                            NULL, globalsize, localsize, 0, 0,
3721                                            (sync && !timeNS) ? 0 : &asyncEvent);
3722 #if !CV_OPENCL_SHOW_RUN_KERNELS
3723     if (retval != CL_SUCCESS)
3724 #endif
3725     {
3726         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3727                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3728                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3729                         sync ? "true" : "false"
3730                         );
3731         if (retval != CL_SUCCESS)
3732         {
3733             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3734         }
3735 #if CV_OPENCL_TRACE_CHECK
3736         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3737 #else
3738         printf("%s\n", msg.c_str());
3739         fflush(stdout);
3740 #endif
3741     }
3742     if (sync || retval != CL_SUCCESS)
3743     {
3744         CV_OCL_DBG_CHECK(clFinish(qq));
3745         if (timeNS)
3746         {
3747             if (retval == CL_SUCCESS)
3748             {
3749                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3750                 cl_ulong startTime, stopTime;
3751                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3752                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3753                 *timeNS = (int64)(stopTime - startTime);
3754             }
3755             else
3756             {
3757                 *timeNS = -1;
3758             }
3759         }
3760         cleanupUMats();
3761     }
3762     else
3763     {
3764         addref();
3765         isInProgress = true;
3766         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3767     }
3768     if (asyncEvent)
3769         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3770     return retval == CL_SUCCESS;
3771 }
3772
3773 bool Kernel::runTask(bool sync, const Queue& q)
3774 {
3775     if(!p || !p->handle || p->isInProgress)
3776         return false;
3777
3778     cl_command_queue qq = getQueue(q);
3779     cl_event asyncEvent = 0;
3780     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3781     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3782     if (sync || retval != CL_SUCCESS)
3783     {
3784         CV_OCL_DBG_CHECK(clFinish(qq));
3785         p->cleanupUMats();
3786     }
3787     else
3788     {
3789         p->addref();
3790         p->isInProgress = true;
3791         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3792     }
3793     if (asyncEvent)
3794         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3795     return retval == CL_SUCCESS;
3796 }
3797
3798 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3799 {
3800     CV_Assert(p && p->handle && !p->isInProgress);
3801     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3802     CV_Assert(q.ptr());
3803     q.finish(); // call clFinish() on base queue
3804     Queue profilingQueue = q.getProfilingQueue();
3805     int64 timeNs = -1;
3806     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3807     return res ? timeNs : -1;
3808 }
3809
3810 size_t Kernel::workGroupSize() const
3811 {
3812     if(!p || !p->handle)
3813         return 0;
3814     size_t val = 0, retsz = 0;
3815     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3816     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3817     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3818     return status == CL_SUCCESS ? val : 0;
3819 }
3820
3821 size_t Kernel::preferedWorkGroupSizeMultiple() const
3822 {
3823     if(!p || !p->handle)
3824         return 0;
3825     size_t val = 0, retsz = 0;
3826     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3827     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3828     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3829     return status == CL_SUCCESS ? val : 0;
3830 }
3831
3832 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3833 {
3834     if(!p || !p->handle || !wsz)
3835         return 0;
3836     size_t retsz = 0;
3837     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3838     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3839     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3840     return status == CL_SUCCESS;
3841 }
3842
3843 size_t Kernel::localMemSize() const
3844 {
3845     if(!p || !p->handle)
3846         return 0;
3847     size_t retsz = 0;
3848     cl_ulong val = 0;
3849     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3850     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3851     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3852     return status == CL_SUCCESS ? (size_t)val : 0;
3853 }
3854
3855
3856
3857 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3858
3859 struct ProgramSource::Impl
3860 {
3861     IMPLEMENT_REFCOUNTABLE();
3862
3863     enum KIND {
3864         PROGRAM_SOURCE_CODE = 0,
3865         PROGRAM_BINARIES,
3866         PROGRAM_SPIR,
3867         PROGRAM_SPIRV
3868     } kind_;
3869
3870     Impl(const String& src)
3871     {
3872         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3873         initFromSource(src, cv::String());
3874     }
3875     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3876     {
3877         init(PROGRAM_SOURCE_CODE, module, name);
3878         initFromSource(codeStr, codeHash);
3879     }
3880
3881     /// reset fields
3882     void init(enum KIND kind, const String& module, const String& name)
3883     {
3884         refcount = 1;
3885         kind_ = kind;
3886         module_ = module;
3887         name_ = name;
3888
3889         sourceAddr_ = NULL;
3890         sourceSize_ = 0;
3891         isHashUpdated = false;
3892     }
3893
3894     void initFromSource(const String& codeStr, const String& codeHash)
3895     {
3896         codeStr_ = codeStr;
3897         sourceHash_ = codeHash;
3898         if (sourceHash_.empty())
3899         {
3900             updateHash();
3901         }
3902         else
3903         {
3904             isHashUpdated = true;
3905         }
3906     }
3907
3908     void updateHash(const char* hashStr = NULL)
3909     {
3910         if (hashStr)
3911         {
3912             sourceHash_ = cv::String(hashStr);
3913             isHashUpdated = true;
3914             return;
3915         }
3916         uint64 hash = 0;
3917         switch (kind_)
3918         {
3919         case PROGRAM_SOURCE_CODE:
3920             if (sourceAddr_)
3921             {
3922                 CV_Assert(codeStr_.empty());
3923                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3924             }
3925             else
3926             {
3927                 CV_Assert(!codeStr_.empty());
3928                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3929             }
3930             break;
3931         case PROGRAM_BINARIES:
3932         case PROGRAM_SPIR:
3933         case PROGRAM_SPIRV:
3934             hash = crc64(sourceAddr_, sourceSize_);
3935             break;
3936         default:
3937             CV_Error(Error::StsInternal, "Internal error");
3938         }
3939         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3940         isHashUpdated = true;
3941     }
3942
3943     Impl(enum KIND kind,
3944             const String& module, const String& name,
3945             const unsigned char* binary, const size_t size,
3946             const cv::String& buildOptions = cv::String())
3947     {
3948         init(kind, module, name);
3949
3950         sourceAddr_ = binary;
3951         sourceSize_ = size;
3952
3953         buildOptions_ = buildOptions;
3954     }
3955
3956     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3957             const char* sourceCodeStaticStr, const char* hashStaticStr,
3958             const cv::String& buildOptions)
3959     {
3960         ProgramSource result;
3961         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3962                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3963         result.p->updateHash(hashStaticStr);
3964         return result;
3965     }
3966
3967     static ProgramSource fromBinary(const String& module, const String& name,
3968             const unsigned char* binary, const size_t size,
3969             const cv::String& buildOptions)
3970     {
3971         ProgramSource result;
3972         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3973         return result;
3974     }
3975
3976     static ProgramSource fromSPIR(const String& module, const String& name,
3977             const unsigned char* binary, const size_t size,
3978             const cv::String& buildOptions)
3979     {
3980         ProgramSource result;
3981         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3982         return result;
3983     }
3984
3985     String module_;
3986     String name_;
3987
3988     // TODO std::vector<ProgramSource> includes_;
3989     String codeStr_; // PROGRAM_SOURCE_CODE only
3990
3991     const unsigned char* sourceAddr_;
3992     size_t sourceSize_;
3993
3994     cv::String buildOptions_;
3995
3996     String sourceHash_;
3997     bool isHashUpdated;
3998
3999     friend struct Program::Impl;
4000     friend struct internal::ProgramEntry;
4001     friend struct Context::Impl;
4002 };
4003
4004
4005 ProgramSource::ProgramSource()
4006 {
4007     p = 0;
4008 }
4009
4010 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4011 {
4012     p = new Impl(module, name, codeStr, codeHash);
4013 }
4014
4015 ProgramSource::ProgramSource(const char* prog)
4016 {
4017     p = new Impl(prog);
4018 }
4019
4020 ProgramSource::ProgramSource(const String& prog)
4021 {
4022     p = new Impl(prog);
4023 }
4024
4025 ProgramSource::~ProgramSource()
4026 {
4027     if(p)
4028         p->release();
4029 }
4030
4031 ProgramSource::ProgramSource(const ProgramSource& prog)
4032 {
4033     p = prog.p;
4034     if(p)
4035         p->addref();
4036 }
4037
4038 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4039 {
4040     Impl* newp = (Impl*)prog.p;
4041     if(newp)
4042         newp->addref();
4043     if(p)
4044         p->release();
4045     p = newp;
4046     return *this;
4047 }
4048
4049 const String& ProgramSource::source() const
4050 {
4051     CV_Assert(p);
4052     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4053     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4054     return p->codeStr_;
4055 }
4056
4057 ProgramSource::hash_t ProgramSource::hash() const
4058 {
4059     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4060 }
4061
4062 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4063         const unsigned char* binary, const size_t size,
4064         const cv::String& buildOptions)
4065 {
4066     CV_Assert(binary);
4067     CV_Assert(size > 0);
4068     return Impl::fromBinary(module, name, binary, size, buildOptions);
4069 }
4070
4071 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4072         const unsigned char* binary, const size_t size,
4073         const cv::String& buildOptions)
4074 {
4075     CV_Assert(binary);
4076     CV_Assert(size > 0);
4077     return Impl::fromBinary(module, name, binary, size, buildOptions);
4078 }
4079
4080
4081 internal::ProgramEntry::operator ProgramSource&() const
4082 {
4083     if (this->pProgramSource == NULL)
4084     {
4085         cv::AutoLock lock(cv::getInitializationMutex());
4086         if (this->pProgramSource == NULL)
4087         {
4088             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4089             ProgramSource* ptr = new ProgramSource(ps);
4090             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4091         }
4092     }
4093     return *this->pProgramSource;
4094 }
4095
4096
4097
4098 /////////////////////////////////////////// Program /////////////////////////////////////////////
4099
4100 static
4101 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4102 {
4103     if (b.empty())
4104         return a;
4105     if (a.empty())
4106         return b;
4107     if (b[0] == ' ')
4108         return a + b;
4109     return a + (cv::String(" ") + b);
4110 }
4111
4112 struct Program::Impl
4113 {
4114     IMPLEMENT_REFCOUNTABLE();
4115
4116     Impl(const ProgramSource& src,
4117          const String& _buildflags, String& errmsg) :
4118          refcount(1),
4119          handle(NULL),
4120          buildflags(_buildflags)
4121     {
4122         const ProgramSource::Impl* src_ = src.getImpl();
4123         CV_Assert(src_);
4124         sourceModule_ = src_->module_;
4125         sourceName_ = src_->name_;
4126         const Context ctx = Context::getDefault();
4127         Device device = ctx.device(0);
4128         if (ctx.ptr() == NULL || device.ptr() == NULL)
4129             return;
4130         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4131         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4132         {
4133             if (device.isAMD())
4134                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4135             else if (device.isIntel())
4136                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4137             const String param_buildExtraOptions = getBuildExtraOptions();
4138             if (!param_buildExtraOptions.empty())
4139                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4140         }
4141         compile(ctx, src_, errmsg);
4142     }
4143
4144     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4145     {
4146         CV_Assert(ctx.getImpl());
4147         CV_Assert(src_);
4148
4149         // We don't cache OpenCL binaries
4150         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4151         {
4152             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4153             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4154             return isLoaded;
4155         }
4156         return compileWithCache(ctx, src_, errmsg);
4157     }
4158
4159     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4160     {
4161         CV_Assert(ctx.getImpl());
4162         CV_Assert(src_);
4163         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4164
4165 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4166         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4167         const std::string base_dir = config.prepareCacheDirectoryForContext(
4168                 ctx.getImpl()->getPrefixString(),
4169                 ctx.getImpl()->getPrefixBase()
4170         );
4171         const String& hash_str = src_->sourceHash_;
4172         cv::String fname;
4173         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4174         {
4175             CV_Assert(!hash_str.empty());
4176             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4177             fname = utils::fs::join(base_dir, fname);
4178         }
4179         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4180         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4181         {
4182             try
4183             {
4184                 std::vector<char> binaryBuf;
4185                 bool res = false;
4186                 {
4187                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4188                     BinaryProgramFile file(fname, hash_str.c_str());
4189                     res = file.read(buildflags, binaryBuf);
4190                 }
4191                 if (res)
4192                 {
4193                     CV_Assert(!binaryBuf.empty());
4194                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4195                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4196                     if (isLoaded)
4197                         return true;
4198                 }
4199             }
4200             catch (const cv::Exception& e)
4201             {
4202                 CV_UNUSED(e);
4203                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4204             }
4205             catch (...)
4206             {
4207                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4208             }
4209         }
4210 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4211         CV_Assert(handle == NULL);
4212         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4213         {
4214             if (!buildFromSources(ctx, src_, errmsg))
4215             {
4216                 return false;
4217             }
4218         }
4219         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4220         {
4221             buildflags = joinBuildOptions(buildflags, " -x spir");
4222             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4223             {
4224                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4225             }
4226             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4227             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4228             if (!isLoaded)
4229                 return false;
4230         }
4231         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4232         {
4233             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4234         }
4235         else
4236         {
4237             CV_Error(Error::StsInternal, "Internal error");
4238         }
4239         CV_Assert(handle != NULL);
4240 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4241         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4242         {
4243             try
4244             {
4245                 std::vector<char> binaryBuf;
4246                 getProgramBinary(binaryBuf);
4247                 {
4248                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4249                     BinaryProgramFile file(fname, hash_str.c_str());
4250                     file.write(buildflags, binaryBuf);
4251                 }
4252             }
4253             catch (const cv::Exception& e)
4254             {
4255                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4256             }
4257             catch (...)
4258             {
4259                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4260             }
4261         }
4262 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4263 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4264         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4265         {
4266             std::vector<char> binaryBuf;
4267             getProgramBinary(binaryBuf);
4268             if (!binaryBuf.empty())
4269             {
4270                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4271                 handle = NULL;
4272                 createFromBinary(ctx, binaryBuf, errmsg);
4273             }
4274         }
4275 #endif
4276         return handle != NULL;
4277     }
4278
4279     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4280     {
4281         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4282
4283         size_t retsz = 0;
4284         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4285                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4286         if (log_retval == CL_SUCCESS && retsz > 1)
4287         {
4288             buffer.resize(retsz + 16);
4289             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4290                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4291             if (log_retval == CL_SUCCESS)
4292             {
4293                 if (retsz < buffer.size())
4294                     buffer[retsz] = 0;
4295                 else
4296                     buffer[buffer.size() - 1] = 0;
4297             }
4298             else
4299             {
4300                 buffer[0] = 0;
4301             }
4302         }
4303
4304         errmsg = String(buffer.data());
4305         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4306                 sourceModule_.c_str(), sourceName_.c_str(),
4307                 result, getOpenCLErrorString(result),
4308                 buildflags.c_str(), errmsg.c_str());
4309         fflush(stdout);
4310     }
4311
4312     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4313     {
4314         CV_Assert(src_);
4315         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4316         CV_Assert(handle == NULL);
4317         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4318                 sourceModule_.c_str(), sourceName_.c_str(),
4319                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4320
4321         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4322
4323         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4324         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4325         CV_Assert(srcptr != NULL);
4326         CV_Assert(srclen > 0);
4327
4328         cl_int retval = 0;
4329
4330         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4331         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4332         CV_Assert(handle || retval != CL_SUCCESS);
4333         if (handle && retval == CL_SUCCESS)
4334         {
4335             size_t n = ctx.ndevices();
4336             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4337             cl_device_id* deviceList = deviceListBuf.data();
4338             for (size_t i = 0; i < n; i++)
4339             {
4340                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4341             }
4342
4343             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4344             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4345 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4346             if (retval != CL_SUCCESS)
4347 #endif
4348             {
4349                 dumpBuildLog_(retval, deviceList, errmsg);
4350
4351                 // don't remove "retval != CL_SUCCESS" condition here:
4352                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4353                 if (retval != CL_SUCCESS && handle)
4354                 {
4355                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4356                     handle = NULL;
4357                 }
4358             }
4359 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4360             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4361             {
4362                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4363                 size_t retsz = 0;
4364                 char kernels_buffer[4096] = {0};
4365                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4366                 if (retsz < sizeof(kernels_buffer))
4367                     kernels_buffer[retsz] = 0;
4368                 else
4369                     kernels_buffer[0] = 0;
4370                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4371             }
4372 #endif
4373
4374         }
4375         return handle != NULL;
4376     }
4377
4378     void getProgramBinary(std::vector<char>& buf)
4379     {
4380         CV_Assert(handle);
4381         size_t sz = 0;
4382         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4383         buf.resize(sz);
4384         uchar* ptr = (uchar*)&buf[0];
4385         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4386     }
4387
4388     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4389     {
4390         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4391     }
4392
4393     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4394     {
4395         CV_Assert(handle == NULL);
4396         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4397         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4398
4399         CV_Assert(binarySize > 0);
4400
4401         size_t ndevices = (int)ctx.ndevices();
4402         AutoBuffer<cl_device_id> devices_(ndevices);
4403         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4404         AutoBuffer<size_t> binarySizes_(ndevices);
4405
4406         cl_device_id* devices = devices_.data();
4407         const uchar** binaryPtrs = binaryPtrs_.data();
4408         size_t* binarySizes = binarySizes_.data();
4409         for (size_t i = 0; i < ndevices; i++)
4410         {
4411             devices[i] = (cl_device_id)ctx.device(i).ptr();
4412             binaryPtrs[i] = binaryAddr;
4413             binarySizes[i] = binarySize;
4414         }
4415
4416         cl_int result = 0;
4417         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4418                                            binarySizes, binaryPtrs, NULL, &result);
4419         if (result != CL_SUCCESS)
4420         {
4421             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4422             if (handle)
4423             {
4424                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4425                 handle = NULL;
4426             }
4427         }
4428         if (!handle)
4429         {
4430             return false;
4431         }
4432         // call clBuildProgram()
4433         {
4434             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4435             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4436             if (result != CL_SUCCESS)
4437             {
4438                 dumpBuildLog_(result, devices, errmsg);
4439                 if (handle)
4440                 {
4441                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4442                     handle = NULL;
4443                 }
4444                 return false;
4445             }
4446         }
4447         // check build status
4448         {
4449             cl_build_status build_status = CL_BUILD_NONE;
4450             size_t retsz = 0;
4451             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4452                     sizeof(build_status), &build_status, &retsz));
4453             if (result == CL_SUCCESS)
4454             {
4455                 if (build_status == CL_BUILD_SUCCESS)
4456                 {
4457                     return true;
4458                 }
4459                 else
4460                 {
4461                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4462                     return false;
4463                 }
4464             }
4465             else
4466             {
4467                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4468                 if (handle)
4469                 {
4470                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4471                     handle = NULL;
4472                 }
4473             }
4474         }
4475 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4476         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4477         {
4478             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4479             size_t retsz = 0;
4480             char kernels_buffer[4096] = {0};
4481             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4482             if (retsz < sizeof(kernels_buffer))
4483                 kernels_buffer[retsz] = 0;
4484             else
4485                 kernels_buffer[0] = 0;
4486             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4487         }
4488 #endif
4489         return handle != NULL;
4490     }
4491
4492     ~Impl()
4493     {
4494         if( handle )
4495         {
4496 #ifdef _WIN32
4497             if (!cv::__termination)
4498 #endif
4499             {
4500                 clReleaseProgram(handle);
4501             }
4502             handle = NULL;
4503         }
4504     }
4505
4506     cl_program handle;
4507
4508     String buildflags;
4509     String sourceModule_;
4510     String sourceName_;
4511 };
4512
4513
4514 Program::Program() { p = 0; }
4515
4516 Program::Program(const ProgramSource& src,
4517         const String& buildflags, String& errmsg)
4518 {
4519     p = 0;
4520     create(src, buildflags, errmsg);
4521 }
4522
4523 Program::Program(const Program& prog)
4524 {
4525     p = prog.p;
4526     if(p)
4527         p->addref();
4528 }
4529
4530 Program& Program::operator = (const Program& prog)
4531 {
4532     Impl* newp = (Impl*)prog.p;
4533     if(newp)
4534         newp->addref();
4535     if(p)
4536         p->release();
4537     p = newp;
4538     return *this;
4539 }
4540
4541 Program::~Program()
4542 {
4543     if(p)
4544         p->release();
4545 }
4546
4547 bool Program::create(const ProgramSource& src,
4548             const String& buildflags, String& errmsg)
4549 {
4550     if(p)
4551     {
4552         p->release();
4553         p = NULL;
4554     }
4555     p = new Impl(src, buildflags, errmsg);
4556     if(!p->handle)
4557     {
4558         p->release();
4559         p = 0;
4560     }
4561     return p != 0;
4562 }
4563
4564 void* Program::ptr() const
4565 {
4566     return p ? p->handle : 0;
4567 }
4568
4569 #ifndef OPENCV_REMOVE_DEPRECATED_API
4570 const ProgramSource& Program::source() const
4571 {
4572     CV_Error(Error::StsNotImplemented, "Removed API");
4573 }
4574
4575 bool Program::read(const String& bin, const String& buildflags)
4576 {
4577     CV_UNUSED(bin); CV_UNUSED(buildflags);
4578     CV_Error(Error::StsNotImplemented, "Removed API");
4579 }
4580
4581 bool Program::write(String& bin) const
4582 {
4583     CV_UNUSED(bin);
4584     CV_Error(Error::StsNotImplemented, "Removed API");
4585 }
4586
4587 String Program::getPrefix() const
4588 {
4589     if(!p)
4590         return String();
4591     Context::Impl* ctx_ = Context::getDefault().getImpl();
4592     CV_Assert(ctx_);
4593     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4594 }
4595
4596 String Program::getPrefix(const String& buildflags)
4597 {
4598         Context::Impl* ctx_ = Context::getDefault().getImpl();
4599         CV_Assert(ctx_);
4600         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4601 }
4602 #endif // OPENCV_REMOVE_DEPRECATED_API
4603
4604 void Program::getBinary(std::vector<char>& binary) const
4605 {
4606     CV_Assert(p && "Empty program");
4607     p->getProgramBinary(binary);
4608 }
4609
4610 Program Context::Impl::getProg(const ProgramSource& src,
4611                                const String& buildflags, String& errmsg)
4612 {
4613     size_t limit = getProgramCountLimit();
4614     const ProgramSource::Impl* src_ = src.getImpl();
4615     CV_Assert(src_);
4616     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4617             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4618             getPrefixString().c_str(),
4619             buildflags.c_str());
4620     {
4621         cv::AutoLock lock(program_cache_mutex);
4622         phash_t::iterator it = phash.find(key);
4623         if (it != phash.end())
4624         {
4625             // TODO LRU cache
4626             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4627             if (i != cacheList.end() && i != cacheList.begin())
4628             {
4629                 cacheList.erase(i);
4630                 cacheList.push_front(key);
4631             }
4632             return it->second;
4633         }
4634         { // cleanup program cache
4635             size_t sz = phash.size();
4636             if (limit > 0 && sz >= limit)
4637             {
4638                 static bool warningFlag = false;
4639                 if (!warningFlag)
4640                 {
4641                     printf("\nWARNING: OpenCV-OpenCL:\n"
4642                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4643                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4644                     warningFlag = true;
4645                 }
4646                 while (!cacheList.empty())
4647                 {
4648                     size_t c = phash.erase(cacheList.back());
4649                     cacheList.pop_back();
4650                     if (c != 0)
4651                         break;
4652                 }
4653             }
4654         }
4655     }
4656     Program prog(src, buildflags, errmsg);
4657     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4658     {
4659         cv::AutoLock lock(program_cache_mutex);
4660         phash.insert(std::pair<std::string, Program>(key, prog));
4661         cacheList.push_front(key);
4662     }
4663     return prog;
4664 }
4665
4666
4667 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4668
4669 template<typename T>
4670 class OpenCLBufferPool
4671 {
4672 protected:
4673     ~OpenCLBufferPool() { }
4674 public:
4675     virtual T allocate(size_t size) = 0;
4676     virtual void release(T buffer) = 0;
4677 };
4678
4679 template <typename Derived, typename BufferEntry, typename T>
4680 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4681 {
4682 private:
4683     inline Derived& derived() { return *static_cast<Derived*>(this); }
4684 protected:
4685     Mutex mutex_;
4686
4687     size_t currentReservedSize;
4688     size_t maxReservedSize;
4689
4690     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4691     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4692
4693     // synchronized
4694     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4695     {
4696         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4697         for (; i != allocatedEntries_.end(); ++i)
4698         {
4699             BufferEntry& e = *i;
4700             if (e.clBuffer_ == buffer)
4701             {
4702                 entry = e;
4703                 allocatedEntries_.erase(i);
4704                 return true;
4705             }
4706         }
4707         return false;
4708     }
4709
4710     // synchronized
4711     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4712     {
4713         if (reservedEntries_.empty())
4714             return false;
4715         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4716         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4717         BufferEntry result;
4718         size_t minDiff = (size_t)(-1);
4719         for (; i != reservedEntries_.end(); ++i)
4720         {
4721             BufferEntry& e = *i;
4722             if (e.capacity_ >= size)
4723             {
4724                 size_t diff = e.capacity_ - size;
4725                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4726                 {
4727                     minDiff = diff;
4728                     result_pos = i;
4729                     result = e;
4730                     if (diff == 0)
4731                         break;
4732                 }
4733             }
4734         }
4735         if (result_pos != reservedEntries_.end())
4736         {
4737             //CV_DbgAssert(result == *result_pos);
4738             reservedEntries_.erase(result_pos);
4739             entry = result;
4740             currentReservedSize -= entry.capacity_;
4741             allocatedEntries_.push_back(entry);
4742             return true;
4743         }
4744         return false;
4745     }
4746
4747     // synchronized
4748     void _checkSizeOfReservedEntries()
4749     {
4750         while (currentReservedSize > maxReservedSize)
4751         {
4752             CV_DbgAssert(!reservedEntries_.empty());
4753             const BufferEntry& entry = reservedEntries_.back();
4754             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4755             currentReservedSize -= entry.capacity_;
4756             derived()._releaseBufferEntry(entry);
4757             reservedEntries_.pop_back();
4758         }
4759     }
4760
4761     inline size_t _allocationGranularity(size_t size)
4762     {
4763         // heuristic values
4764         if (size < 1024*1024)
4765             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4766         else if (size < 16*1024*1024)
4767             return 64*1024;
4768         else
4769             return 1024*1024;
4770     }
4771
4772 public:
4773     OpenCLBufferPoolBaseImpl()
4774         : currentReservedSize(0),
4775           maxReservedSize(0)
4776     {
4777         // nothing
4778     }
4779     virtual ~OpenCLBufferPoolBaseImpl()
4780     {
4781         freeAllReservedBuffers();
4782         CV_Assert(reservedEntries_.empty());
4783     }
4784 public:
4785     virtual T allocate(size_t size) CV_OVERRIDE
4786     {
4787         AutoLock locker(mutex_);
4788         BufferEntry entry;
4789         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4790         {
4791             CV_DbgAssert(size <= entry.capacity_);
4792             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4793         }
4794         else
4795         {
4796             derived()._allocateBufferEntry(entry, size);
4797         }
4798         return entry.clBuffer_;
4799     }
4800     virtual void release(T buffer) CV_OVERRIDE
4801     {
4802         AutoLock locker(mutex_);
4803         BufferEntry entry;
4804         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4805         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4806         {
4807             derived()._releaseBufferEntry(entry);
4808         }
4809         else
4810         {
4811             reservedEntries_.push_front(entry);
4812             currentReservedSize += entry.capacity_;
4813             _checkSizeOfReservedEntries();
4814         }
4815     }
4816
4817     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4818     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4819     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4820     {
4821         AutoLock locker(mutex_);
4822         size_t oldMaxReservedSize = maxReservedSize;
4823         maxReservedSize = size;
4824         if (maxReservedSize < oldMaxReservedSize)
4825         {
4826             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4827             for (; i != reservedEntries_.end();)
4828             {
4829                 const BufferEntry& entry = *i;
4830                 if (entry.capacity_ > maxReservedSize / 8)
4831                 {
4832                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4833                     currentReservedSize -= entry.capacity_;
4834                     derived()._releaseBufferEntry(entry);
4835                     i = reservedEntries_.erase(i);
4836                     continue;
4837                 }
4838                 ++i;
4839             }
4840             _checkSizeOfReservedEntries();
4841         }
4842     }
4843     virtual void freeAllReservedBuffers() CV_OVERRIDE
4844     {
4845         AutoLock locker(mutex_);
4846         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4847         for (; i != reservedEntries_.end(); ++i)
4848         {
4849             const BufferEntry& entry = *i;
4850             derived()._releaseBufferEntry(entry);
4851         }
4852         reservedEntries_.clear();
4853         currentReservedSize = 0;
4854     }
4855 };
4856
4857 struct CLBufferEntry
4858 {
4859     cl_mem clBuffer_;
4860     size_t capacity_;
4861     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4862 };
4863
4864 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4865 {
4866 public:
4867     typedef struct CLBufferEntry BufferEntry;
4868 protected:
4869     int createFlags_;
4870 public:
4871     OpenCLBufferPoolImpl(int createFlags = 0)
4872         : createFlags_(createFlags)
4873     {
4874     }
4875
4876     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4877     {
4878         CV_DbgAssert(entry.clBuffer_ == NULL);
4879         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4880         Context& ctx = Context::getDefault();
4881         cl_int retval = CL_SUCCESS;
4882         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4883         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4884         CV_Assert(entry.clBuffer_ != NULL);
4885         if(retval == CL_SUCCESS)
4886         {
4887             CV_IMPL_ADD(CV_IMPL_OCL);
4888         }
4889         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4890                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4891         allocatedEntries_.push_back(entry);
4892     }
4893
4894     void _releaseBufferEntry(const BufferEntry& entry)
4895     {
4896         CV_Assert(entry.capacity_ != 0);
4897         CV_Assert(entry.clBuffer_ != NULL);
4898         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4899                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4900         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4901     }
4902 };
4903
4904 #ifdef HAVE_OPENCL_SVM
4905 struct CLSVMBufferEntry
4906 {
4907     void* clBuffer_;
4908     size_t capacity_;
4909     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4910 };
4911 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4912 {
4913 public:
4914     typedef struct CLSVMBufferEntry BufferEntry;
4915 public:
4916     OpenCLSVMBufferPoolImpl()
4917     {
4918     }
4919
4920     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4921     {
4922         CV_DbgAssert(entry.clBuffer_ == NULL);
4923         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4924
4925         Context& ctx = Context::getDefault();
4926         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4927         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4928         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4929                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4930
4931         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4932         CV_DbgAssert(svmFns->isValid());
4933
4934         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4935         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4936         CV_Assert(buf);
4937
4938         entry.clBuffer_ = buf;
4939         {
4940             CV_IMPL_ADD(CV_IMPL_OCL);
4941         }
4942         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4943                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4944         allocatedEntries_.push_back(entry);
4945     }
4946
4947     void _releaseBufferEntry(const BufferEntry& entry)
4948     {
4949         CV_Assert(entry.capacity_ != 0);
4950         CV_Assert(entry.clBuffer_ != NULL);
4951         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4952                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4953         Context& ctx = Context::getDefault();
4954         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4955         CV_DbgAssert(svmFns->isValid());
4956         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4957         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4958     }
4959 };
4960 #endif
4961
4962
4963
4964 template <bool readAccess, bool writeAccess>
4965 class AlignedDataPtr
4966 {
4967 protected:
4968     const size_t size_;
4969     uchar* const originPtr_;
4970     const size_t alignment_;
4971     uchar* ptr_;
4972     uchar* allocatedPtr_;
4973
4974 public:
4975     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4976         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4977     {
4978         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4979         CV_DbgAssert(!readAccess || ptr);
4980         if (((size_t)ptr_ & (alignment - 1)) != 0)
4981         {
4982             allocatedPtr_ = new uchar[size_ + alignment - 1];
4983             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4984             if (readAccess)
4985             {
4986                 memcpy(ptr_, originPtr_, size_);
4987             }
4988         }
4989     }
4990
4991     uchar* getAlignedPtr() const
4992     {
4993         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4994         return ptr_;
4995     }
4996
4997     ~AlignedDataPtr()
4998     {
4999         if (allocatedPtr_)
5000         {
5001             if (writeAccess)
5002             {
5003                 memcpy(originPtr_, ptr_, size_);
5004             }
5005             delete[] allocatedPtr_;
5006             allocatedPtr_ = NULL;
5007         }
5008         ptr_ = NULL;
5009     }
5010 private:
5011     AlignedDataPtr(const AlignedDataPtr&); // disabled
5012     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5013 };
5014
5015 template <bool readAccess, bool writeAccess>
5016 class AlignedDataPtr2D
5017 {
5018 protected:
5019     const size_t size_;
5020     uchar* const originPtr_;
5021     const size_t alignment_;
5022     uchar* ptr_;
5023     uchar* allocatedPtr_;
5024     size_t rows_;
5025     size_t cols_;
5026     size_t step_;
5027
5028 public:
5029     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5030         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5031     {
5032         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5033         CV_DbgAssert(!readAccess || ptr != NULL);
5034         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5035         {
5036             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5037             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5038             if (readAccess)
5039             {
5040                 for (size_t i = 0; i < rows_; i++)
5041                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5042             }
5043         }
5044     }
5045
5046     uchar* getAlignedPtr() const
5047     {
5048         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5049         return ptr_;
5050     }
5051
5052     ~AlignedDataPtr2D()
5053     {
5054         if (allocatedPtr_)
5055         {
5056             if (writeAccess)
5057             {
5058                 for (size_t i = 0; i < rows_; i++)
5059                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5060             }
5061             delete[] allocatedPtr_;
5062             allocatedPtr_ = NULL;
5063         }
5064         ptr_ = NULL;
5065     }
5066 private:
5067     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5068     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5069 };
5070
5071 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5072 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5073 #endif
5074
5075
5076 void Context::Impl::__init_buffer_pools()
5077 {
5078     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5079     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5080     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5081     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5082
5083     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5084     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5085     bufferPool.setMaxReservedSize(poolSize);
5086     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5087     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5088
5089 #ifdef HAVE_OPENCL_SVM
5090     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5091     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5092     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5093     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5094 #endif
5095
5096     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5097 }
5098
5099 class OpenCLAllocator CV_FINAL : public MatAllocator
5100 {
5101 public:
5102     enum AllocatorFlags
5103     {
5104         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5105         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5106 #ifdef HAVE_OPENCL_SVM
5107         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5108 #endif
5109         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5110     };
5111
5112     OpenCLAllocator()
5113     {
5114         matStdAllocator = Mat::getDefaultAllocator();
5115     }
5116     ~OpenCLAllocator()
5117     {
5118         flushCleanupQueue();
5119     }
5120
5121     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5122             AccessFlag flags, UMatUsageFlags usageFlags) const
5123     {
5124         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5125         return u;
5126     }
5127
5128     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5129     {
5130         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5131         return value;
5132     }
5133     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5134     {
5135         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5136         return value;
5137     }
5138
5139     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5140     {
5141         const Device& dev = ctx.device(0);
5142         createFlags = 0;
5143         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5144             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5145
5146         if (!isOpenCLCopyingForced() &&
5147             (isOpenCLMapForced() ||
5148                 (dev.hostUnifiedMemory()
5149 #ifndef __APPLE__
5150                 || dev.isIntel()
5151 #endif
5152                 )
5153             )
5154         )
5155             flags0 = static_cast<UMatData::MemoryFlag>(0);
5156         else
5157             flags0 = UMatData::COPY_ON_MAP;
5158     }
5159
5160     UMatData* allocate(int dims, const int* sizes, int type,
5161                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5162     {
5163         if(!useOpenCL())
5164             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5165
5166         flushCleanupQueue();
5167
5168         CV_Assert(data == 0);
5169         size_t total = CV_ELEM_SIZE(type);
5170         for( int i = dims-1; i >= 0; i-- )
5171         {
5172             if( step )
5173                 step[i] = total;
5174             total *= sizes[i];
5175         }
5176
5177         Context& ctx = Context::getDefault();
5178         if (!ctx.getImpl())
5179             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5180         Context::Impl& ctxImpl = *ctx.getImpl();
5181
5182         int createFlags = 0;
5183         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5184         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5185
5186         void* handle = NULL;
5187         int allocatorFlags = 0;
5188
5189 #ifdef HAVE_OPENCL_SVM
5190         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5191         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5192         {
5193             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5194             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5195
5196             // this property is constant, so single buffer pool can be used here
5197             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5198             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5199         }
5200         else
5201 #endif
5202         if (createFlags == 0)
5203         {
5204             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5205             handle = ctxImpl.getBufferPool().allocate(total);
5206         }
5207         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5208         {
5209             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5210             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5211         }
5212         else
5213         {
5214             CV_Assert(handle != NULL); // Unsupported, throw
5215         }
5216
5217         if (!handle)
5218             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5219
5220         UMatData* u = new UMatData(this);
5221         u->data = 0;
5222         u->size = total;
5223         u->handle = handle;
5224         u->flags = flags0;
5225         u->allocatorFlags_ = allocatorFlags;
5226         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5227         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5228         u->markHostCopyObsolete(true);
5229         opencl_allocator_stats.onAllocate(u->size);
5230         return u;
5231     }
5232
5233     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5234     {
5235         if(!u)
5236             return false;
5237
5238         flushCleanupQueue();
5239
5240         UMatDataAutoLock lock(u);
5241
5242         if(u->handle == 0)
5243         {
5244             CV_Assert(u->origdata != 0);
5245             Context& ctx = Context::getDefault();
5246             int createFlags = 0;
5247             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5248             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5249
5250             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5251
5252             cl_context ctx_handle = (cl_context)ctx.ptr();
5253             int allocatorFlags = 0;
5254             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5255             void* handle = NULL;
5256             cl_int retval = CL_SUCCESS;
5257
5258 #ifdef HAVE_OPENCL_SVM
5259             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5260             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5261             if (useSVM && svmCaps.isSupportFineGrainSystem())
5262             {
5263                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5264                 tempUMatFlags = UMatData::TEMP_UMAT;
5265                 handle = u->origdata;
5266                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5267             }
5268             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5269             {
5270                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5271                 {
5272                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5273
5274                     cl_svm_mem_flags memFlags = createFlags |
5275                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5276
5277                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5278                     CV_DbgAssert(svmFns->isValid());
5279
5280                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5281                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5282                     CV_Assert(handle);
5283
5284                     cl_command_queue q = NULL;
5285                     if (!isFineGrainBuffer)
5286                     {
5287                         q = (cl_command_queue)Queue::getDefault().ptr();
5288                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5289                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5290                                 handle, u->size,
5291                                 0, NULL, NULL);
5292                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5293
5294                     }
5295                     memcpy(handle, u->origdata, u->size);
5296                     if (!isFineGrainBuffer)
5297                     {
5298                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5299                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5300                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5301                     }
5302
5303                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5304                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5305                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5306                 }
5307             }
5308             else
5309 #endif
5310             {
5311                 if( copyOnMap )
5312                     accessFlags &= ~ACCESS_FAST;
5313
5314                 tempUMatFlags = UMatData::TEMP_UMAT;
5315                 if (
5316                 #ifdef __APPLE__
5317                     !copyOnMap &&
5318                 #endif
5319                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5320                     // There are OpenCL runtime issues for less aligned data
5321                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5322                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5323                     // Avoid sharing of host memory between OpenCL buffers
5324                     && !(u->originalUMatData && u->originalUMatData->handle)
5325                 )
5326                 {
5327                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5328                                             u->size, u->origdata, &retval);
5329                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5330                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5331                 }
5332                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5333                 {
5334                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5335                                                u->size, u->origdata, &retval);
5336                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5337                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5338                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5339                 }
5340             }
5341             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5342             if(!handle || retval != CL_SUCCESS)
5343                 return false;
5344             u->handle = handle;
5345             u->prevAllocator = u->currAllocator;
5346             u->currAllocator = this;
5347             u->flags |= tempUMatFlags | flags0;
5348             u->allocatorFlags_ = allocatorFlags;
5349         }
5350         if (!!(accessFlags & ACCESS_WRITE))
5351             u->markHostCopyObsolete(true);
5352         opencl_allocator_stats.onAllocate(u->size);
5353         return true;
5354     }
5355
5356     /*void sync(UMatData* u) const
5357     {
5358         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5359         UMatDataAutoLock lock(u);
5360
5361         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5362         {
5363             if( u->tempCopiedUMat() )
5364             {
5365                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5366                                     u->size, u->origdata, 0, 0, 0);
5367             }
5368             else
5369             {
5370                 cl_int retval = 0;
5371                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5372                                                 (CL_MAP_READ | CL_MAP_WRITE),
5373                                                 0, u->size, 0, 0, 0, &retval);
5374                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5375                 clFinish(q);
5376             }
5377             u->markHostCopyObsolete(false);
5378         }
5379         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5380         {
5381             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5382                                  u->size, u->data, 0, 0, 0);
5383         }
5384     }*/
5385
5386     void deallocate(UMatData* u) const CV_OVERRIDE
5387     {
5388         if(!u)
5389             return;
5390
5391         CV_Assert(u->urefcount == 0);
5392         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5393
5394         CV_Assert(u->handle != 0);
5395         CV_Assert(u->mapcount == 0);
5396
5397         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5398             addToCleanupQueue(u);
5399         else
5400             deallocate_(u);
5401     }
5402
5403     void deallocate_(UMatData* u) const
5404     {
5405         CV_Assert(u);
5406         CV_Assert(u->handle);
5407         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5408         {
5409             opencl_allocator_stats.onFree(u->size);
5410         }
5411
5412 #ifdef _WIN32
5413         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5414             return;             // avoid any OpenCL calls
5415 #endif
5416         if(u->tempUMat())
5417         {
5418             CV_Assert(u->origdata);
5419 //            UMatDataAutoLock lock(u);
5420
5421             if (u->hostCopyObsolete())
5422             {
5423 #ifdef HAVE_OPENCL_SVM
5424                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5425                 {
5426                     Context& ctx = Context::getDefault();
5427                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5428                     CV_DbgAssert(svmFns->isValid());
5429
5430                     if( u->tempCopiedUMat() )
5431                     {
5432                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5433                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5434                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5435                         cl_command_queue q = NULL;
5436                         if (!isFineGrainBuffer)
5437                         {
5438                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5439                             q = (cl_command_queue)Queue::getDefault().ptr();
5440                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5441                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5442                                     u->handle, u->size,
5443                                     0, NULL, NULL);
5444                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5445                         }
5446                         clFinish(q);
5447                         memcpy(u->origdata, u->handle, u->size);
5448                         if (!isFineGrainBuffer)
5449                         {
5450                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5451                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5452                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5453                         }
5454                     }
5455                     else
5456                     {
5457                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5458                         // nothing
5459                     }
5460                 }
5461                 else
5462 #endif
5463                 {
5464                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5465                     if( u->tempCopiedUMat() )
5466                     {
5467                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5468                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5469                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5470                     }
5471                     else
5472                     {
5473                         cl_int retval = 0;
5474                         if (u->tempUMat())
5475                         {
5476                             CV_Assert(u->mapcount == 0);
5477                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5478                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5479                                 (CL_MAP_READ | CL_MAP_WRITE),
5480                                 0, u->size, 0, 0, 0, &retval);
5481                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5482                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5483                             if (u->originalUMatData)
5484                             {
5485                                 CV_Assert(u->originalUMatData->data == data);
5486                             }
5487                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5488                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5489                             CV_OCL_DBG_CHECK(clFinish(q));
5490                         }
5491                     }
5492                 }
5493                 u->markHostCopyObsolete(false);
5494             }
5495             else
5496             {
5497                 // nothing
5498             }
5499 #ifdef HAVE_OPENCL_SVM
5500             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5501             {
5502                 if( u->tempCopiedUMat() )
5503                 {
5504                     Context& ctx = Context::getDefault();
5505                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5506                     CV_DbgAssert(svmFns->isValid());
5507
5508                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5509                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5510                 }
5511             }
5512             else
5513 #endif
5514             {
5515                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5516                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5517             }
5518             u->handle = 0;
5519             u->markDeviceCopyObsolete(true);
5520             u->currAllocator = u->prevAllocator;
5521             u->prevAllocator = NULL;
5522             if(u->data && u->copyOnMap() && u->data != u->origdata)
5523                 fastFree(u->data);
5524             u->data = u->origdata;
5525             u->currAllocator->deallocate(u);
5526             u = NULL;
5527         }
5528         else
5529         {
5530             CV_Assert(u->origdata == NULL);
5531             if(u->data && u->copyOnMap() && u->data != u->origdata)
5532             {
5533                 fastFree(u->data);
5534                 u->data = 0;
5535                 u->markHostCopyObsolete(true);
5536             }
5537             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5538             {
5539                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5540                 CV_Assert(pCtx);
5541                 ocl::Context& ctx = *pCtx.get();
5542                 CV_Assert(ctx.getImpl());
5543                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5544             }
5545             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5546             {
5547                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5548                 CV_Assert(pCtx);
5549                 ocl::Context& ctx = *pCtx.get();
5550                 CV_Assert(ctx.getImpl());
5551                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5552             }
5553 #ifdef HAVE_OPENCL_SVM
5554             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5555             {
5556                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5557                 CV_Assert(pCtx);
5558                 ocl::Context& ctx = *pCtx.get();
5559                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5560                 {
5561                     //nothing
5562                 }
5563                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5564                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5565                 {
5566                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5567                     CV_DbgAssert(svmFns->isValid());
5568                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5569
5570                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5571                     {
5572                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5573                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5574                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5575                     }
5576                 }
5577                 CV_Assert(ctx.getImpl());
5578                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5579             }
5580 #endif
5581             else
5582             {
5583                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5584             }
5585             u->handle = 0;
5586             u->markDeviceCopyObsolete(true);
5587             delete u;
5588             u = NULL;
5589         }
5590         CV_Assert(u == NULL);
5591     }
5592
5593     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5594     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5595     {
5596         CV_Assert(u && u->handle);
5597
5598         if (!!(accessFlags & ACCESS_WRITE))
5599             u->markDeviceCopyObsolete(true);
5600
5601         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5602
5603         {
5604             if( !u->copyOnMap() )
5605             {
5606                 // TODO
5607                 // because there can be other map requests for the same UMat with different access flags,
5608                 // we use the universal (read-write) access mode.
5609 #ifdef HAVE_OPENCL_SVM
5610                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5611                 {
5612                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5613                     {
5614                         Context& ctx = Context::getDefault();
5615                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5616                         CV_DbgAssert(svmFns->isValid());
5617
5618                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5619                         {
5620                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5621                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5622                                     u->handle, u->size,
5623                                     0, NULL, NULL);
5624                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5625                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5626                         }
5627                     }
5628                     clFinish(q);
5629                     u->data = (uchar*)u->handle;
5630                     u->markHostCopyObsolete(false);
5631                     u->markDeviceMemMapped(true);
5632                     return;
5633                 }
5634 #endif
5635
5636                 cl_int retval = CL_SUCCESS;
5637                 if (!u->deviceMemMapped())
5638                 {
5639                     CV_Assert(u->refcount == 1);
5640                     CV_Assert(u->mapcount++ == 0);
5641                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5642                                                          (CL_MAP_READ | CL_MAP_WRITE),
5643                                                          0, u->size, 0, 0, 0, &retval);
5644                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5645                 }
5646                 if (u->data && retval == CL_SUCCESS)
5647                 {
5648                     u->markHostCopyObsolete(false);
5649                     u->markDeviceMemMapped(true);
5650                     return;
5651                 }
5652
5653                 // TODO Is it really a good idea and was it tested well?
5654                 // if map failed, switch to copy-on-map mode for the particular buffer
5655                 u->flags |= UMatData::COPY_ON_MAP;
5656             }
5657
5658             if(!u->data)
5659             {
5660                 u->data = (uchar*)fastMalloc(u->size);
5661                 u->markHostCopyObsolete(true);
5662             }
5663         }
5664
5665         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5666         {
5667             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5668 #ifdef HAVE_OPENCL_SVM
5669             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5670 #endif
5671             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5672                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5673             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5674                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5675             u->markHostCopyObsolete(false);
5676         }
5677     }
5678
5679     void unmap(UMatData* u) const CV_OVERRIDE
5680     {
5681         if(!u)
5682             return;
5683
5684
5685         CV_Assert(u->handle != 0);
5686
5687         UMatDataAutoLock autolock(u);
5688
5689         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5690         cl_int retval = 0;
5691         if( !u->copyOnMap() && u->deviceMemMapped() )
5692         {
5693             CV_Assert(u->data != NULL);
5694 #ifdef HAVE_OPENCL_SVM
5695             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5696             {
5697                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5698                 {
5699                     Context& ctx = Context::getDefault();
5700                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5701                     CV_DbgAssert(svmFns->isValid());
5702
5703                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5704                     {
5705                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5706                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5707                                 0, NULL, NULL);
5708                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5709                         clFinish(q);
5710                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5711                     }
5712                 }
5713                 if (u->refcount == 0)
5714                     u->data = 0;
5715                 u->markDeviceCopyObsolete(false);
5716                 u->markHostCopyObsolete(true);
5717                 return;
5718             }
5719 #endif
5720             if (u->refcount == 0)
5721             {
5722                 CV_Assert(u->mapcount-- == 1);
5723                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5724                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5725                 if (Device::getDefault().isAMD())
5726                 {
5727                     // required for multithreaded applications (see stitching test)
5728                     CV_OCL_DBG_CHECK(clFinish(q));
5729                 }
5730                 u->markDeviceMemMapped(false);
5731                 u->data = 0;
5732                 u->markDeviceCopyObsolete(false);
5733                 u->markHostCopyObsolete(true);
5734             }
5735         }
5736         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5737         {
5738             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5739 #ifdef HAVE_OPENCL_SVM
5740             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5741 #endif
5742             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5743                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5744             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5745                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5746             u->markDeviceCopyObsolete(false);
5747             u->markHostCopyObsolete(true);
5748         }
5749     }
5750
5751     bool checkContinuous(int dims, const size_t sz[],
5752                          const size_t srcofs[], const size_t srcstep[],
5753                          const size_t dstofs[], const size_t dststep[],
5754                          size_t& total, size_t new_sz[],
5755                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5756                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5757     {
5758         bool iscontinuous = true;
5759         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5760         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5761         total = sz[dims-1];
5762         for( int i = dims-2; i >= 0; i-- )
5763         {
5764             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5765                 iscontinuous = false;
5766             total *= sz[i];
5767             if( srcofs )
5768                 srcrawofs += srcofs[i]*srcstep[i];
5769             if( dstofs )
5770                 dstrawofs += dstofs[i]*dststep[i];
5771         }
5772
5773         if( !iscontinuous )
5774         {
5775             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5776             if( dims == 2 )
5777             {
5778                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5779                 // we assume that new_... arrays are initialized by caller
5780                 // with 0's, so there is no else branch
5781                 if( srcofs )
5782                 {
5783                     new_srcofs[0] = srcofs[1];
5784                     new_srcofs[1] = srcofs[0];
5785                     new_srcofs[2] = 0;
5786                 }
5787
5788                 if( dstofs )
5789                 {
5790                     new_dstofs[0] = dstofs[1];
5791                     new_dstofs[1] = dstofs[0];
5792                     new_dstofs[2] = 0;
5793                 }
5794
5795                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5796                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5797             }
5798             else
5799             {
5800                 // we could check for dims == 3 here,
5801                 // but from user perspective this one is more informative
5802                 CV_Assert(dims <= 3);
5803                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5804                 if( srcofs )
5805                 {
5806                     new_srcofs[0] = srcofs[2];
5807                     new_srcofs[1] = srcofs[1];
5808                     new_srcofs[2] = srcofs[0];
5809                 }
5810
5811                 if( dstofs )
5812                 {
5813                     new_dstofs[0] = dstofs[2];
5814                     new_dstofs[1] = dstofs[1];
5815                     new_dstofs[2] = dstofs[0];
5816                 }
5817
5818                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5819                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5820             }
5821         }
5822         return iscontinuous;
5823     }
5824
5825     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5826                   const size_t srcofs[], const size_t srcstep[],
5827                   const size_t dststep[]) const CV_OVERRIDE
5828     {
5829         if(!u)
5830             return;
5831         UMatDataAutoLock autolock(u);
5832
5833         if( u->data && !u->hostCopyObsolete() )
5834         {
5835             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5836             return;
5837         }
5838         CV_Assert( u->handle != 0 );
5839
5840         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5841
5842         size_t total = 0, new_sz[] = {0, 0, 0};
5843         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5844         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5845
5846         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5847                                             total, new_sz,
5848                                             srcrawofs, new_srcofs, new_srcstep,
5849                                             dstrawofs, new_dstofs, new_dststep);
5850
5851 #ifdef HAVE_OPENCL_SVM
5852         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5853         {
5854             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5855             Context& ctx = Context::getDefault();
5856             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5857             CV_DbgAssert(svmFns->isValid());
5858
5859             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5860             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5861             {
5862                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5863                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5864                         u->handle, u->size,
5865                         0, NULL, NULL);
5866                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5867             }
5868             clFinish(q);
5869             if( iscontinuous )
5870             {
5871                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5872             }
5873             else
5874             {
5875                 // This code is from MatAllocator::download()
5876                 int isz[CV_MAX_DIM];
5877                 uchar* srcptr = (uchar*)u->handle;
5878                 for( int i = 0; i < dims; i++ )
5879                 {
5880                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5881                     if( sz[i] == 0 )
5882                     return;
5883                     if( srcofs )
5884                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5885                     isz[i] = (int)sz[i];
5886                 }
5887
5888                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5889                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5890
5891                 const Mat* arrays[] = { &src, &dst };
5892                 uchar* ptrs[2];
5893                 NAryMatIterator it(arrays, ptrs, 2);
5894                 size_t j, planesz = it.size;
5895
5896                 for( j = 0; j < it.nplanes; j++, ++it )
5897                     memcpy(ptrs[1], ptrs[0], planesz);
5898             }
5899             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5900             {
5901                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5902                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5903                         0, NULL, NULL);
5904                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5905                 clFinish(q);
5906             }
5907         }
5908         else
5909 #endif
5910         {
5911             if( iscontinuous )
5912             {
5913                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5914                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5915                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5916             }
5917             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5918             {
5919                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5920                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5921                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5922                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5923                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5924                 uchar* ptr = alignedPtr.getAlignedPtr();
5925
5926                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5927                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5928                 total = std::min(total, u->size - new_srcrawofs);
5929                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5930                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5931                 for( size_t i = 0; i < new_sz[1]; i++ )
5932                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5933             }
5934             else
5935             {
5936                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5937                 uchar* ptr = alignedPtr.getAlignedPtr();
5938
5939                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5940                     new_srcofs, new_dstofs, new_sz,
5941                     new_srcstep[0], 0,
5942                     new_dststep[0], 0,
5943                     ptr, 0, 0, 0));
5944             }
5945         }
5946     }
5947
5948     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5949                 const size_t dstofs[], const size_t dststep[],
5950                 const size_t srcstep[]) const CV_OVERRIDE
5951     {
5952         if(!u)
5953             return;
5954
5955         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5956         CV_Assert(u->refcount == 0 || u->tempUMat());
5957
5958         size_t total = 0, new_sz[] = {0, 0, 0};
5959         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5960         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5961
5962         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5963                                             total, new_sz,
5964                                             srcrawofs, new_srcofs, new_srcstep,
5965                                             dstrawofs, new_dstofs, new_dststep);
5966
5967         UMatDataAutoLock autolock(u);
5968
5969         // if there is cached CPU copy of the GPU matrix,
5970         // we could use it as a destination.
5971         // we can do it in 2 cases:
5972         //    1. we overwrite the whole content
5973         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5974         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5975         {
5976             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5977             u->markHostCopyObsolete(false);
5978             u->markDeviceCopyObsolete(true);
5979             return;
5980         }
5981
5982         CV_Assert( u->handle != 0 );
5983         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5984
5985 #ifdef HAVE_OPENCL_SVM
5986         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5987         {
5988             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5989             Context& ctx = Context::getDefault();
5990             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5991             CV_DbgAssert(svmFns->isValid());
5992
5993             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5994             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5995             {
5996                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5997                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5998                         u->handle, u->size,
5999                         0, NULL, NULL);
6000                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6001             }
6002             clFinish(q);
6003             if( iscontinuous )
6004             {
6005                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6006             }
6007             else
6008             {
6009                 // This code is from MatAllocator::upload()
6010                 int isz[CV_MAX_DIM];
6011                 uchar* dstptr = (uchar*)u->handle;
6012                 for( int i = 0; i < dims; i++ )
6013                 {
6014                     CV_Assert( sz[i] <= (size_t)INT_MAX );
6015                     if( sz[i] == 0 )
6016                     return;
6017                     if( dstofs )
6018                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6019                     isz[i] = (int)sz[i];
6020                 }
6021
6022                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6023                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6024
6025                 const Mat* arrays[] = { &src, &dst };
6026                 uchar* ptrs[2];
6027                 NAryMatIterator it(arrays, ptrs, 2);
6028                 size_t j, planesz = it.size;
6029
6030                 for( j = 0; j < it.nplanes; j++, ++it )
6031                     memcpy(ptrs[1], ptrs[0], planesz);
6032             }
6033             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6034             {
6035                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6036                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6037                         0, NULL, NULL);
6038                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6039                 clFinish(q);
6040             }
6041         }
6042         else
6043 #endif
6044         {
6045             if( iscontinuous )
6046             {
6047                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6048                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6049                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6050                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6051                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6052             }
6053             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6054             {
6055                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6056                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6057                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6058                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6059                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6060                 uchar* ptr = alignedPtr.getAlignedPtr();
6061
6062                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6063                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6064                 total = std::min(total, u->size - new_dstrawofs);
6065                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6066                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6067                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6068                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6069                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6070                 for( size_t i = 0; i < new_sz[1]; i++ )
6071                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6072                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6073                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6074             }
6075             else
6076             {
6077                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6078                 uchar* ptr = alignedPtr.getAlignedPtr();
6079
6080                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6081                     new_dstofs, new_srcofs, new_sz,
6082                     new_dststep[0], 0,
6083                     new_srcstep[0], 0,
6084                     ptr, 0, 0, 0));
6085             }
6086         }
6087         u->markHostCopyObsolete(true);
6088 #ifdef HAVE_OPENCL_SVM
6089         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6090                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6091         {
6092             // nothing
6093         }
6094         else
6095 #endif
6096         {
6097             u->markHostCopyObsolete(true);
6098         }
6099         u->markDeviceCopyObsolete(false);
6100     }
6101
6102     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6103               const size_t srcofs[], const size_t srcstep[],
6104               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6105     {
6106         if(!src || !dst)
6107             return;
6108
6109         size_t total = 0, new_sz[] = {0, 0, 0};
6110         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6111         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6112
6113         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6114                                             total, new_sz,
6115                                             srcrawofs, new_srcofs, new_srcstep,
6116                                             dstrawofs, new_dstofs, new_dststep);
6117
6118         UMatDataAutoLock src_autolock(src, dst);
6119
6120         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6121         {
6122             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6123             return;
6124         }
6125         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6126         {
6127             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6128             dst->markHostCopyObsolete(false);
6129 #ifdef HAVE_OPENCL_SVM
6130             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6131                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6132             {
6133                 // nothing
6134             }
6135             else
6136 #endif
6137             {
6138                 dst->markDeviceCopyObsolete(true);
6139             }
6140             return;
6141         }
6142
6143         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6144         CV_Assert(dst->refcount == 0);
6145         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6146
6147         cl_int retval = CL_SUCCESS;
6148 #ifdef HAVE_OPENCL_SVM
6149         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6150                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6151         {
6152             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6153                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6154             {
6155                 Context& ctx = Context::getDefault();
6156                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6157                 CV_DbgAssert(svmFns->isValid());
6158
6159                 if( iscontinuous )
6160                 {
6161                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6162                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6163                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6164                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6165                             total, 0, NULL, NULL);
6166                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6167                 }
6168                 else
6169                 {
6170                     clFinish(q);
6171                     // This code is from MatAllocator::download()/upload()
6172                     int isz[CV_MAX_DIM];
6173                     uchar* srcptr = (uchar*)src->handle;
6174                     for( int i = 0; i < dims; i++ )
6175                     {
6176                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6177                         if( sz[i] == 0 )
6178                         return;
6179                         if( srcofs )
6180                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6181                         isz[i] = (int)sz[i];
6182                     }
6183                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6184
6185                     uchar* dstptr = (uchar*)dst->handle;
6186                     for( int i = 0; i < dims; i++ )
6187                     {
6188                         if( dstofs )
6189                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6190                     }
6191                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6192
6193                     const Mat* arrays[] = { &m_src, &m_dst };
6194                     uchar* ptrs[2];
6195                     NAryMatIterator it(arrays, ptrs, 2);
6196                     size_t j, planesz = it.size;
6197
6198                     for( j = 0; j < it.nplanes; j++, ++it )
6199                         memcpy(ptrs[1], ptrs[0], planesz);
6200                 }
6201             }
6202             else
6203             {
6204                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6205                 {
6206                     map(src, ACCESS_READ);
6207                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6208                     unmap(src);
6209                 }
6210                 else
6211                 {
6212                     map(dst, ACCESS_WRITE);
6213                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6214                     unmap(dst);
6215                 }
6216             }
6217         }
6218         else
6219 #endif
6220         {
6221             if( iscontinuous )
6222             {
6223                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6224                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6225                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6226                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6227             }
6228             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6229             {
6230                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6231                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6232                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6233                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6234                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6235
6236                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6237                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6238                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6239                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6240                 uchar* srcptr = srcBuf.getAlignedPtr();
6241                 uchar* dstptr = dstBuf.getAlignedPtr();
6242
6243                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6244
6245                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6246                 src_total = std::min(src_total, src->size - new_srcrawofs);
6247                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6248                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6249
6250                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6251                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6252                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6253                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6254
6255                 for( size_t i = 0; i < new_sz[1]; i++ )
6256                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6257                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6258                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6259                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6260             }
6261             else
6262             {
6263                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6264                                                    new_srcofs, new_dstofs, new_sz,
6265                                                    new_srcstep[0], 0,
6266                                                    new_dststep[0], 0,
6267                                                    0, 0, 0));
6268             }
6269         }
6270         if (retval == CL_SUCCESS)
6271         {
6272             CV_IMPL_ADD(CV_IMPL_OCL)
6273         }
6274
6275 #ifdef HAVE_OPENCL_SVM
6276         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6277             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6278         {
6279             // nothing
6280         }
6281         else
6282 #endif
6283         {
6284             dst->markHostCopyObsolete(true);
6285         }
6286         dst->markDeviceCopyObsolete(false);
6287
6288         if( _sync )
6289         {
6290             CV_OCL_DBG_CHECK(clFinish(q));
6291         }
6292     }
6293
6294     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6295     {
6296         ocl::Context ctx = Context::getDefault();
6297         if (ctx.empty())
6298             return NULL;
6299 #ifdef HAVE_OPENCL_SVM
6300         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6301         {
6302             return &ctx.getImpl()->getBufferPoolSVM();
6303         }
6304 #endif
6305         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6306         {
6307             return &ctx.getImpl()->getBufferPoolHostPtr();
6308         }
6309         if (id != NULL && strcmp(id, "OCL") != 0)
6310         {
6311             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6312         }
6313         return &ctx.getImpl()->getBufferPool();
6314     }
6315
6316     MatAllocator* matStdAllocator;
6317
6318     mutable cv::Mutex cleanupQueueMutex;
6319     mutable std::deque<UMatData*> cleanupQueue;
6320
6321     void flushCleanupQueue() const
6322     {
6323         if (!cleanupQueue.empty())
6324         {
6325             std::deque<UMatData*> q;
6326             {
6327                 cv::AutoLock lock(cleanupQueueMutex);
6328                 q.swap(cleanupQueue);
6329             }
6330             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6331             {
6332                 deallocate_(*i);
6333             }
6334         }
6335     }
6336     void addToCleanupQueue(UMatData* u) const
6337     {
6338         //TODO: Validation check: CV_Assert(!u->tempUMat());
6339         {
6340             cv::AutoLock lock(cleanupQueueMutex);
6341             cleanupQueue.push_back(u);
6342         }
6343     }
6344 };
6345
6346 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6347 {
6348     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6349     g_isOpenCVActivated = true;
6350     return g_allocator;
6351 }
6352 MatAllocator* getOpenCLAllocator()
6353 {
6354     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6355 }
6356
6357 }} // namespace cv::ocl
6358
6359
6360 namespace cv {
6361
6362 // three funcs below are implemented in umatrix.cpp
6363 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6364               bool autoSteps = false );
6365 void finalizeHdr(UMat& m);
6366
6367 } // namespace cv
6368
6369
6370 namespace cv { namespace ocl {
6371
6372 /*
6373 // Convert OpenCL buffer memory to UMat
6374 */
6375 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6376 {
6377     int d = 2;
6378     int sizes[] = { rows, cols };
6379
6380     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6381
6382     dst.release();
6383
6384     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6385     dst.usageFlags = USAGE_DEFAULT;
6386
6387     setSize(dst, d, sizes, 0, true);
6388     dst.offset = 0;
6389
6390     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6391     cl_mem_object_type mem_type = 0;
6392
6393     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6394
6395     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6396
6397     size_t total = 0;
6398     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6399
6400     CV_OCL_CHECK(clRetainMemObject(memobj));
6401
6402     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6403     CV_Assert(total >= rows * step);
6404
6405     // attach clBuffer to UMatData
6406     dst.u = new UMatData(getOpenCLAllocator());
6407     dst.u->data            = 0;
6408     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6409     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6410     dst.u->handle          = cl_mem_buffer;
6411     dst.u->origdata        = 0;
6412     dst.u->prevAllocator   = 0;
6413     dst.u->size            = total;
6414
6415     finalizeHdr(dst);
6416     dst.addref();
6417
6418     return;
6419 } // convertFromBuffer()
6420
6421
6422 /*
6423 // Convert OpenCL image2d_t memory to UMat
6424 */
6425 void convertFromImage(void* cl_mem_image, UMat& dst)
6426 {
6427     cl_mem             clImage = (cl_mem)cl_mem_image;
6428     cl_mem_object_type mem_type = 0;
6429
6430     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6431
6432     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6433
6434     cl_image_format fmt = { 0, 0 };
6435     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6436
6437     int depth = CV_8U;
6438     switch (fmt.image_channel_data_type)
6439     {
6440     case CL_UNORM_INT8:
6441     case CL_UNSIGNED_INT8:
6442         depth = CV_8U;
6443         break;
6444
6445     case CL_SNORM_INT8:
6446     case CL_SIGNED_INT8:
6447         depth = CV_8S;
6448         break;
6449
6450     case CL_UNORM_INT16:
6451     case CL_UNSIGNED_INT16:
6452         depth = CV_16U;
6453         break;
6454
6455     case CL_SNORM_INT16:
6456     case CL_SIGNED_INT16:
6457         depth = CV_16S;
6458         break;
6459
6460     case CL_SIGNED_INT32:
6461         depth = CV_32S;
6462         break;
6463
6464     case CL_FLOAT:
6465         depth = CV_32F;
6466         break;
6467
6468     default:
6469         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6470     }
6471
6472     int type = CV_8UC1;
6473     switch (fmt.image_channel_order)
6474     {
6475     case CL_R:
6476         type = CV_MAKE_TYPE(depth, 1);
6477         break;
6478
6479     case CL_RGBA:
6480     case CL_BGRA:
6481     case CL_ARGB:
6482         type = CV_MAKE_TYPE(depth, 4);
6483         break;
6484
6485     default:
6486         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6487         break;
6488     }
6489
6490     size_t step = 0;
6491     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6492
6493     size_t w = 0;
6494     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6495
6496     size_t h = 0;
6497     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6498
6499     dst.create((int)h, (int)w, type);
6500
6501     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6502
6503     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6504
6505     size_t offset = 0;
6506     size_t src_origin[3] = { 0, 0, 0 };
6507     size_t region[3] = { w, h, 1 };
6508     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6509
6510     CV_OCL_CHECK(clFinish(q));
6511
6512     return;
6513 } // convertFromImage()
6514
6515
6516 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6517
6518 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6519 {
6520     cl_uint numDevices = 0;
6521     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6522     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6523     {
6524         CV_OCL_DBG_CHECK_RESULT(status,
6525             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6526     }
6527
6528     if (numDevices == 0)
6529     {
6530         devices.clear();
6531         return;
6532     }
6533
6534     devices.resize((size_t)numDevices);
6535     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6536 }
6537
6538 struct PlatformInfo::Impl
6539 {
6540     Impl(void* id)
6541     {
6542         refcount = 1;
6543         handle = *(cl_platform_id*)id;
6544         getDevices(devices, handle);
6545     }
6546
6547     String getStrProp(cl_platform_info prop) const
6548     {
6549         char buf[1024];
6550         size_t sz=0;
6551         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6552             sz < sizeof(buf) ? String(buf) : String();
6553     }
6554
6555     IMPLEMENT_REFCOUNTABLE();
6556     std::vector<cl_device_id> devices;
6557     cl_platform_id handle;
6558 };
6559
6560 PlatformInfo::PlatformInfo()
6561 {
6562     p = 0;
6563 }
6564
6565 PlatformInfo::PlatformInfo(void* platform_id)
6566 {
6567     p = new Impl(platform_id);
6568 }
6569
6570 PlatformInfo::~PlatformInfo()
6571 {
6572     if(p)
6573         p->release();
6574 }
6575
6576 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6577 {
6578     if (i.p)
6579         i.p->addref();
6580     p = i.p;
6581 }
6582
6583 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6584 {
6585     if (i.p != p)
6586     {
6587         if (i.p)
6588             i.p->addref();
6589         if (p)
6590             p->release();
6591         p = i.p;
6592     }
6593     return *this;
6594 }
6595
6596 int PlatformInfo::deviceNumber() const
6597 {
6598     return p ? (int)p->devices.size() : 0;
6599 }
6600
6601 void PlatformInfo::getDevice(Device& device, int d) const
6602 {
6603     CV_Assert(p && d < (int)p->devices.size() );
6604     if(p)
6605         device.set(p->devices[d]);
6606 }
6607
6608 String PlatformInfo::name() const
6609 {
6610     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6611 }
6612
6613 String PlatformInfo::vendor() const
6614 {
6615     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6616 }
6617
6618 String PlatformInfo::version() const
6619 {
6620     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6621 }
6622
6623 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6624 {
6625     cl_uint numPlatforms = 0;
6626     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6627
6628     if (numPlatforms == 0)
6629     {
6630         platforms.clear();
6631         return;
6632     }
6633
6634     platforms.resize((size_t)numPlatforms);
6635     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6636 }
6637
6638 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6639 {
6640     std::vector<cl_platform_id> platforms;
6641     getPlatforms(platforms);
6642
6643     for (size_t i = 0; i < platforms.size(); i++)
6644         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6645 }
6646
6647 const char* typeToStr(int type)
6648 {
6649     static const char* tab[]=
6650     {
6651         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6652         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6653         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6654         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6655         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6656         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6657         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6658         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6659         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6660     };
6661     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6662     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6663     CV_Assert(result);
6664     return result;
6665 }
6666
6667 const char* memopTypeToStr(int type)
6668 {
6669     static const char* tab[] =
6670     {
6671         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6672         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6673         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6674         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6675         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6676         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6677         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6678         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6679         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6680     };
6681     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6682     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6683     CV_Assert(result);
6684     return result;
6685 }
6686
6687 const char* vecopTypeToStr(int type)
6688 {
6689     static const char* tab[] =
6690     {
6691         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6692         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6693         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6694         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6695         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6696         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6697         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6698         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6699         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6700     };
6701     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6702     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6703     CV_Assert(result);
6704     return result;
6705 }
6706
6707 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6708 {
6709     if( sdepth == ddepth )
6710         return "noconvert";
6711     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6712     if( ddepth >= CV_32F ||
6713         (ddepth == CV_32S && sdepth < CV_32S) ||
6714         (ddepth == CV_16S && sdepth <= CV_8S) ||
6715         (ddepth == CV_16U && sdepth == CV_8U))
6716     {
6717         sprintf(buf, "convert_%s", typestr);
6718     }
6719     else if( sdepth >= CV_32F )
6720         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6721     else
6722         sprintf(buf, "convert_%s_sat", typestr);
6723
6724     return buf;
6725 }
6726
6727 const char* getOpenCLErrorString(int errorCode)
6728 {
6729 #define CV_OCL_CODE(id) case id: return #id
6730 #define CV_OCL_CODE_(id, name) case id: return #name
6731     switch (errorCode)
6732     {
6733     CV_OCL_CODE(CL_SUCCESS);
6734     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6735     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6736     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6737     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6738     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6739     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6740     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6741     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6742     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6743     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6744     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6745     CV_OCL_CODE(CL_MAP_FAILURE);
6746     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6747     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6748     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6749     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6750     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6751     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6752     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6753     CV_OCL_CODE(CL_INVALID_VALUE);
6754     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6755     CV_OCL_CODE(CL_INVALID_PLATFORM);
6756     CV_OCL_CODE(CL_INVALID_DEVICE);
6757     CV_OCL_CODE(CL_INVALID_CONTEXT);
6758     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6759     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6760     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6761     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6762     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6763     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6764     CV_OCL_CODE(CL_INVALID_SAMPLER);
6765     CV_OCL_CODE(CL_INVALID_BINARY);
6766     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6767     CV_OCL_CODE(CL_INVALID_PROGRAM);
6768     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6769     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6770     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6771     CV_OCL_CODE(CL_INVALID_KERNEL);
6772     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6773     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6774     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6775     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6776     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6777     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6778     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6779     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6780     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6781     CV_OCL_CODE(CL_INVALID_EVENT);
6782     CV_OCL_CODE(CL_INVALID_OPERATION);
6783     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6784     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6785     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6786     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6787     // OpenCL 1.1
6788     CV_OCL_CODE(CL_INVALID_PROPERTY);
6789     // OpenCL 1.2
6790     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6791     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6792     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6793     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6794     // OpenCL 2.0
6795     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6796     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6797     // Extensions
6798     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6799     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6800     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6801     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6802     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6803     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6804     default: return "Unknown OpenCL error";
6805     }
6806 #undef CV_OCL_CODE
6807 #undef CV_OCL_CODE_
6808 }
6809
6810 template <typename T>
6811 static std::string kerToStr(const Mat & k)
6812 {
6813     int width = k.cols - 1, depth = k.depth();
6814     const T * const data = k.ptr<T>();
6815
6816     std::ostringstream stream;
6817     stream.precision(10);
6818
6819     if (depth <= CV_8S)
6820     {
6821         for (int i = 0; i < width; ++i)
6822             stream << "DIG(" << (int)data[i] << ")";
6823         stream << "DIG(" << (int)data[width] << ")";
6824     }
6825     else if (depth == CV_32F)
6826     {
6827         stream.setf(std::ios_base::showpoint);
6828         for (int i = 0; i < width; ++i)
6829             stream << "DIG(" << data[i] << "f)";
6830         stream << "DIG(" << data[width] << "f)";
6831     }
6832     else
6833     {
6834         for (int i = 0; i < width; ++i)
6835             stream << "DIG(" << data[i] << ")";
6836         stream << "DIG(" << data[width] << ")";
6837     }
6838
6839     return stream.str();
6840 }
6841
6842 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6843 {
6844     Mat kernel = _kernel.getMat().reshape(1, 1);
6845
6846     int depth = kernel.depth();
6847     if (ddepth < 0)
6848         ddepth = depth;
6849
6850     if (ddepth != depth)
6851         kernel.convertTo(kernel, ddepth);
6852
6853     typedef std::string (* func_t)(const Mat &);
6854     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6855                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6856     const func_t func = funcs[ddepth];
6857     CV_Assert(func != 0);
6858
6859     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6860 }
6861
6862 #define PROCESS_SRC(src) \
6863     do \
6864     { \
6865         if (!src.empty()) \
6866         { \
6867             CV_Assert(src.isMat() || src.isUMat()); \
6868             Size csize = src.size(); \
6869             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6870                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6871             if (cwidth < ckercn || ckercn <= 0) \
6872                 return 1; \
6873             cols.push_back(cwidth); \
6874             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6875                 return 1; \
6876             offsets.push_back(src.offset()); \
6877             steps.push_back(src.step()); \
6878             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6879             kercns.push_back(ckercn); \
6880         } \
6881     } \
6882     while ((void)0, 0)
6883
6884 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6885                               InputArray src4, InputArray src5, InputArray src6,
6886                               InputArray src7, InputArray src8, InputArray src9,
6887                               OclVectorStrategy strat)
6888 {
6889     const ocl::Device & d = ocl::Device::getDefault();
6890
6891     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6892         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6893         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6894         d.preferredVectorWidthDouble(), -1 };
6895
6896     // if the device says don't use vectors
6897     if (vectorWidths[0] == 1)
6898     {
6899         // it's heuristic
6900         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6901         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6902         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6903     }
6904
6905     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6906 }
6907
6908 int checkOptimalVectorWidth(const int *vectorWidths,
6909                             InputArray src1, InputArray src2, InputArray src3,
6910                             InputArray src4, InputArray src5, InputArray src6,
6911                             InputArray src7, InputArray src8, InputArray src9,
6912                             OclVectorStrategy strat)
6913 {
6914     CV_Assert(vectorWidths);
6915
6916     int ref_type = src1.type();
6917
6918     std::vector<size_t> offsets, steps, cols;
6919     std::vector<int> dividers, kercns;
6920     PROCESS_SRC(src1);
6921     PROCESS_SRC(src2);
6922     PROCESS_SRC(src3);
6923     PROCESS_SRC(src4);
6924     PROCESS_SRC(src5);
6925     PROCESS_SRC(src6);
6926     PROCESS_SRC(src7);
6927     PROCESS_SRC(src8);
6928     PROCESS_SRC(src9);
6929
6930     size_t size = offsets.size();
6931
6932     for (size_t i = 0; i < size; ++i)
6933         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6934             dividers[i] >>= 1, kercns[i] >>= 1;
6935
6936     // default strategy
6937     int kercn = *std::min_element(kercns.begin(), kercns.end());
6938
6939     return kercn;
6940 }
6941
6942 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6943                                  InputArray src4, InputArray src5, InputArray src6,
6944                                  InputArray src7, InputArray src8, InputArray src9)
6945 {
6946     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6947 }
6948
6949 #undef PROCESS_SRC
6950
6951
6952 // TODO Make this as a method of OpenCL "BuildOptions" class
6953 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6954 {
6955     if (!buildOptions.empty())
6956         buildOptions += " ";
6957     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6958     buildOptions += format(
6959             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6960             name.c_str(), ocl::typeToStr(type),
6961             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6962             name.c_str(), (int)CV_MAT_CN(type),
6963             name.c_str(), (int)CV_ELEM_SIZE(type),
6964             name.c_str(), (int)CV_ELEM_SIZE1(type),
6965             name.c_str(), (int)depth
6966             );
6967 }
6968
6969
6970 struct Image2D::Impl
6971 {
6972     Impl(const UMat &src, bool norm, bool alias)
6973     {
6974         handle = 0;
6975         refcount = 1;
6976         init(src, norm, alias);
6977     }
6978
6979     ~Impl()
6980     {
6981         if (handle)
6982             clReleaseMemObject(handle);
6983     }
6984
6985     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6986     {
6987         cl_image_format format;
6988         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6989                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6990         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6991                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6992         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6993
6994         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6995         int channelOrder = channelOrders[cn];
6996         format.image_channel_data_type = (cl_channel_type)channelType;
6997         format.image_channel_order = (cl_channel_order)channelOrder;
6998         return format;
6999     }
7000
7001     static bool isFormatSupported(cl_image_format format)
7002     {
7003         if (!haveOpenCL())
7004             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7005
7006         cl_context context = (cl_context)Context::getDefault().ptr();
7007         if (!context)
7008             return false;
7009
7010         // Figure out how many formats are supported by this context.
7011         cl_uint numFormats = 0;
7012         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7013                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
7014                                                 NULL, &numFormats);
7015         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7016         if (numFormats > 0)
7017         {
7018             AutoBuffer<cl_image_format> formats(numFormats);
7019             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7020                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
7021                                              formats.data(), NULL);
7022             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7023             for (cl_uint i = 0; i < numFormats; ++i)
7024             {
7025                 if (!memcmp(&formats[i], &format, sizeof(format)))
7026                 {
7027                     return true;
7028                 }
7029             }
7030         }
7031         return false;
7032     }
7033
7034     void init(const UMat &src, bool norm, bool alias)
7035     {
7036         if (!haveOpenCL())
7037             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7038
7039         CV_Assert(!src.empty());
7040         CV_Assert(ocl::Device::getDefault().imageSupport());
7041
7042         int err, depth = src.depth(), cn = src.channels();
7043         CV_Assert(cn <= 4);
7044         cl_image_format format = getImageFormat(depth, cn, norm);
7045
7046         if (!isFormatSupported(format))
7047             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7048
7049         if (alias && !src.handle(ACCESS_RW))
7050             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7051
7052         cl_context context = (cl_context)Context::getDefault().ptr();
7053         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7054
7055 #ifdef CL_VERSION_1_2
7056         // this enables backwards portability to
7057         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7058         const Device & d = ocl::Device::getDefault();
7059         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7060         CV_Assert(!alias || canCreateAlias(src));
7061         if (1 < major || (1 == major && 2 <= minor))
7062         {
7063             cl_image_desc desc;
7064             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7065             desc.image_width      = src.cols;
7066             desc.image_height     = src.rows;
7067             desc.image_depth      = 0;
7068             desc.image_array_size = 1;
7069             desc.image_row_pitch  = alias ? src.step[0] : 0;
7070             desc.image_slice_pitch = 0;
7071             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7072             desc.num_mip_levels   = 0;
7073             desc.num_samples      = 0;
7074             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7075         }
7076         else
7077 #endif
7078         {
7079             CV_SUPPRESS_DEPRECATED_START
7080             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7081             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7082             CV_SUPPRESS_DEPRECATED_END
7083         }
7084         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7085
7086         size_t origin[] = { 0, 0, 0 };
7087         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7088
7089         cl_mem devData;
7090         if (!alias && !src.isContinuous())
7091         {
7092             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7093             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7094                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7095                 ).c_str());
7096
7097             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7098             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7099                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7100             CV_OCL_DBG_CHECK(clFlush(queue));
7101         }
7102         else
7103         {
7104             devData = (cl_mem)src.handle(ACCESS_READ);
7105         }
7106         CV_Assert(devData != NULL);
7107
7108         if (!alias)
7109         {
7110             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7111             if (!src.isContinuous())
7112             {
7113                 CV_OCL_DBG_CHECK(clFlush(queue));
7114                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7115             }
7116         }
7117     }
7118
7119     IMPLEMENT_REFCOUNTABLE();
7120
7121     cl_mem handle;
7122 };
7123
7124 Image2D::Image2D()
7125 {
7126     p = NULL;
7127 }
7128
7129 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7130 {
7131     p = new Impl(src, norm, alias);
7132 }
7133
7134 bool Image2D::canCreateAlias(const UMat &m)
7135 {
7136     bool ret = false;
7137     const Device & d = ocl::Device::getDefault();
7138     if (d.imageFromBufferSupport() && !m.empty())
7139     {
7140         // This is the required pitch alignment in pixels
7141         uint pitchAlign = d.imagePitchAlignment();
7142         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7143         {
7144             // We don't currently handle the case where the buffer was created
7145             // with CL_MEM_USE_HOST_PTR
7146             if (!m.u->tempUMat())
7147             {
7148                 ret = true;
7149             }
7150         }
7151     }
7152     return ret;
7153 }
7154
7155 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7156 {
7157     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7158
7159     return Impl::isFormatSupported(format);
7160 }
7161
7162 Image2D::Image2D(const Image2D & i)
7163 {
7164     p = i.p;
7165     if (p)
7166         p->addref();
7167 }
7168
7169 Image2D & Image2D::operator = (const Image2D & i)
7170 {
7171     if (i.p != p)
7172     {
7173         if (i.p)
7174             i.p->addref();
7175         if (p)
7176             p->release();
7177         p = i.p;
7178     }
7179     return *this;
7180 }
7181
7182 Image2D::~Image2D()
7183 {
7184     if (p)
7185         p->release();
7186 }
7187
7188 void* Image2D::ptr() const
7189 {
7190     return p ? p->handle : 0;
7191 }
7192
7193 bool internal::isOpenCLForced()
7194 {
7195     static bool initialized = false;
7196     static bool value = false;
7197     if (!initialized)
7198     {
7199         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7200         initialized = true;
7201     }
7202     return value;
7203 }
7204
7205 bool internal::isPerformanceCheckBypassed()
7206 {
7207     static bool initialized = false;
7208     static bool value = false;
7209     if (!initialized)
7210     {
7211         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7212         initialized = true;
7213     }
7214     return value;
7215 }
7216
7217 bool internal::isCLBuffer(UMat& u)
7218 {
7219     void* h = u.handle(ACCESS_RW);
7220     if (!h)
7221         return true;
7222     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7223 #if 1
7224     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7225         return false;
7226 #else
7227     cl_mem_object_type type = 0;
7228     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7229     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7230         return false;
7231 #endif
7232     return true;
7233 }
7234
7235 struct Timer::Impl
7236 {
7237     const Queue queue;
7238
7239     Impl(const Queue& q)
7240         : queue(q)
7241     {
7242     }
7243
7244     ~Impl(){}
7245
7246     void start()
7247     {
7248         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7249         timer.start();
7250     }
7251
7252     void stop()
7253     {
7254         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7255         timer.stop();
7256     }
7257
7258     uint64 durationNS() const
7259     {
7260         return (uint64)(timer.getTimeSec() * 1e9);
7261     }
7262
7263     TickMeter timer;
7264 };
7265
7266 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7267 Timer::~Timer() { delete p; }
7268
7269 void Timer::start()
7270 {
7271     CV_Assert(p);
7272     p->start();
7273 }
7274
7275 void Timer::stop()
7276 {
7277     CV_Assert(p);
7278     p->stop();
7279 }
7280
7281 uint64 Timer::durationNS() const
7282 {
7283     CV_Assert(p);
7284     return p->durationNS();
7285 }
7286
7287 }} // namespace
7288
7289 #endif // HAVE_OPENCL