Merge pull request #19383 from tomoaki0705:fixWrongIndex
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832 // true if we have initialized OpenCL subsystem with available platforms
833 static bool g_isOpenCVActivated = false;
834
835 bool haveOpenCL()
836 {
837     CV_TRACE_FUNCTION();
838     static bool g_isOpenCLInitialized = false;
839     static bool g_isOpenCLAvailable = false;
840
841     if (!g_isOpenCLInitialized)
842     {
843         CV_TRACE_REGION("Init_OpenCL_Runtime");
844         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
845         if (envPath)
846         {
847             if (cv::String(envPath) == "disabled")
848             {
849                 g_isOpenCLAvailable = false;
850                 g_isOpenCLInitialized = true;
851             }
852         }
853         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
854         try
855         {
856             cl_uint n = 0;
857             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
858             g_isOpenCVActivated = n > 0;
859         }
860         catch (...)
861         {
862             g_isOpenCLAvailable = false;
863         }
864         g_isOpenCLInitialized = true;
865     }
866     return g_isOpenCLAvailable;
867 }
868
869 bool useOpenCL()
870 {
871     CoreTLSData& data = getCoreTlsData();
872     if (data.useOpenCL < 0)
873     {
874         try
875         {
876             data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
877         }
878         catch (...)
879         {
880             data.useOpenCL = 0;
881         }
882     }
883     return data.useOpenCL > 0;
884 }
885
886 bool isOpenCLActivated()
887 {
888     if (!g_isOpenCVActivated)
889         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
890     return useOpenCL();
891 }
892
893 void setUseOpenCL(bool flag)
894 {
895     CV_TRACE_FUNCTION();
896
897     CoreTLSData& data = getCoreTlsData();
898     if (!flag)
899     {
900         data.useOpenCL = 0;
901     }
902     else if( haveOpenCL() )
903     {
904         data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
905     }
906 }
907
908 #ifdef HAVE_CLAMDBLAS
909
910 class AmdBlasHelper
911 {
912 public:
913     static AmdBlasHelper & getInstance()
914     {
915         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
916     }
917
918     bool isAvailable() const
919     {
920         return g_isAmdBlasAvailable;
921     }
922
923     ~AmdBlasHelper()
924     {
925         try
926         {
927             clAmdBlasTeardown();
928         }
929         catch (...) { }
930     }
931
932 protected:
933     AmdBlasHelper()
934     {
935         if (!g_isAmdBlasInitialized)
936         {
937             AutoLock lock(getInitializationMutex());
938
939             if (!g_isAmdBlasInitialized)
940             {
941                 if (haveOpenCL())
942                 {
943                     try
944                     {
945                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
946                     }
947                     catch (...)
948                     {
949                         g_isAmdBlasAvailable = false;
950                     }
951                 }
952                 else
953                     g_isAmdBlasAvailable = false;
954
955                 g_isAmdBlasInitialized = true;
956             }
957         }
958     }
959
960 private:
961     static bool g_isAmdBlasInitialized;
962     static bool g_isAmdBlasAvailable;
963 };
964
965 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
966 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
967
968 bool haveAmdBlas()
969 {
970     return AmdBlasHelper::getInstance().isAvailable();
971 }
972
973 #else
974
975 bool haveAmdBlas()
976 {
977     return false;
978 }
979
980 #endif
981
982 #ifdef HAVE_CLAMDFFT
983
984 class AmdFftHelper
985 {
986 public:
987     static AmdFftHelper & getInstance()
988     {
989         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
990     }
991
992     bool isAvailable() const
993     {
994         return g_isAmdFftAvailable;
995     }
996
997     ~AmdFftHelper()
998     {
999         try
1000         {
1001 //            clAmdFftTeardown();
1002         }
1003         catch (...) { }
1004     }
1005
1006 protected:
1007     AmdFftHelper()
1008     {
1009         if (!g_isAmdFftInitialized)
1010         {
1011             AutoLock lock(getInitializationMutex());
1012
1013             if (!g_isAmdFftInitialized)
1014             {
1015                 if (haveOpenCL())
1016                 {
1017                     try
1018                     {
1019                         cl_uint major, minor, patch;
1020                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1021
1022                         // it throws exception in case AmdFft binaries are not found
1023                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1024                         g_isAmdFftAvailable = true;
1025                     }
1026                     catch (const Exception &)
1027                     {
1028                         g_isAmdFftAvailable = false;
1029                     }
1030                 }
1031                 else
1032                     g_isAmdFftAvailable = false;
1033
1034                 g_isAmdFftInitialized = true;
1035             }
1036         }
1037     }
1038
1039 private:
1040     static clAmdFftSetupData setupData;
1041     static bool g_isAmdFftInitialized;
1042     static bool g_isAmdFftAvailable;
1043 };
1044
1045 clAmdFftSetupData AmdFftHelper::setupData;
1046 bool AmdFftHelper::g_isAmdFftAvailable = false;
1047 bool AmdFftHelper::g_isAmdFftInitialized = false;
1048
1049 bool haveAmdFft()
1050 {
1051     return AmdFftHelper::getInstance().isAvailable();
1052 }
1053
1054 #else
1055
1056 bool haveAmdFft()
1057 {
1058     return false;
1059 }
1060
1061 #endif
1062
1063 bool haveSVM()
1064 {
1065 #ifdef HAVE_OPENCL_SVM
1066     return true;
1067 #else
1068     return false;
1069 #endif
1070 }
1071
1072 void finish()
1073 {
1074     Queue::getDefault().finish();
1075 }
1076
1077 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1078
1079 struct Platform::Impl
1080 {
1081     Impl()
1082     {
1083         refcount = 1;
1084         handle = 0;
1085         initialized = false;
1086     }
1087
1088     ~Impl() {}
1089
1090     void init()
1091     {
1092         if( !initialized )
1093         {
1094             //cl_uint num_entries
1095             cl_uint n = 0;
1096             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1097                 handle = 0;
1098             if( handle != 0 )
1099             {
1100                 char buf[1000];
1101                 size_t len = 0;
1102                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1103                 buf[len] = '\0';
1104                 vendor = String(buf);
1105             }
1106
1107             initialized = true;
1108         }
1109     }
1110
1111     IMPLEMENT_REFCOUNTABLE();
1112
1113     cl_platform_id handle;
1114     String vendor;
1115     bool initialized;
1116 };
1117
1118 Platform::Platform()
1119 {
1120     p = 0;
1121 }
1122
1123 Platform::~Platform()
1124 {
1125     if(p)
1126         p->release();
1127 }
1128
1129 Platform::Platform(const Platform& pl)
1130 {
1131     p = (Impl*)pl.p;
1132     if(p)
1133         p->addref();
1134 }
1135
1136 Platform& Platform::operator = (const Platform& pl)
1137 {
1138     Impl* newp = (Impl*)pl.p;
1139     if(newp)
1140         newp->addref();
1141     if(p)
1142         p->release();
1143     p = newp;
1144     return *this;
1145 }
1146
1147 void* Platform::ptr() const
1148 {
1149     return p ? p->handle : 0;
1150 }
1151
1152 Platform& Platform::getDefault()
1153 {
1154     static Platform p;
1155     if( !p.p )
1156     {
1157         p.p = new Impl;
1158         p.p->init();
1159     }
1160     return p;
1161 }
1162
1163 /////////////////////////////////////// Device ////////////////////////////////////////////
1164
1165 // Version has format:
1166 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1167 // by specification
1168 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1169 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1170 //   https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html
1171 //   https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html
1172 static void parseOpenCLVersion(const String &version, int &major, int &minor)
1173 {
1174     major = minor = 0;
1175     if (10 >= version.length())
1176         return;
1177     const char *pstr = version.c_str();
1178     if (0 != strncmp(pstr, "OpenCL ", 7))
1179         return;
1180     size_t ppos = version.find('.', 7);
1181     if (String::npos == ppos)
1182         return;
1183     String temp = version.substr(7, ppos - 7);
1184     major = atoi(temp.c_str());
1185     temp = version.substr(ppos + 1);
1186     minor = atoi(temp.c_str());
1187 }
1188
1189 struct Device::Impl
1190 {
1191     Impl(void* d)
1192     {
1193         handle = (cl_device_id)d;
1194         refcount = 1;
1195
1196         name_ = getStrProp(CL_DEVICE_NAME);
1197         version_ = getStrProp(CL_DEVICE_VERSION);
1198         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1199         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1200         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1201         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1202         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1203         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1204         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1205         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1206
1207         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1208         parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1209
1210         size_t pos = 0;
1211         while (pos < extensions_.size())
1212         {
1213             size_t pos2 = extensions_.find(' ', pos);
1214             if (pos2 == String::npos)
1215                 pos2 = extensions_.size();
1216             if (pos2 > pos)
1217             {
1218                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1219                 extensions_set_.insert(extensionName);
1220             }
1221             pos = pos2 + 1;
1222         }
1223
1224         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1225
1226         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1227         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1228             vendorName_ == "AMD")
1229             vendorID_ = VENDOR_AMD;
1230         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1231             vendorID_ = VENDOR_INTEL;
1232         else if (vendorName_ == "NVIDIA Corporation")
1233             vendorID_ = VENDOR_NVIDIA;
1234         else
1235             vendorID_ = UNKNOWN_VENDOR;
1236
1237         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1238         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1239         {
1240             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1241             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1242                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1243             maxWorkGroupSize_ = new_maxWorkGroupSize;
1244         }
1245 #if 0
1246         if (isExtensionSupported("cl_khr_spir"))
1247         {
1248 #ifndef CL_DEVICE_SPIR_VERSIONS
1249 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1250 #endif
1251             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1252             std::cout << spir_versions << std::endl;
1253         }
1254 #endif
1255     }
1256
1257     template<typename _TpCL, typename _TpOut>
1258     _TpOut getProp(cl_device_info prop) const
1259     {
1260         _TpCL temp=_TpCL();
1261         size_t sz = 0;
1262
1263         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1264             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1265     }
1266
1267     bool getBoolProp(cl_device_info prop) const
1268     {
1269         cl_bool temp = CL_FALSE;
1270         size_t sz = 0;
1271
1272         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1273             sz == sizeof(temp) ? temp != 0 : false;
1274     }
1275
1276     String getStrProp(cl_device_info prop) const
1277     {
1278         char buf[4096];
1279         size_t sz=0;
1280         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1281             sz < sizeof(buf) ? String(buf) : String();
1282     }
1283
1284     bool isExtensionSupported(const std::string& extensionName) const
1285     {
1286         return extensions_set_.count(extensionName) > 0;
1287     }
1288
1289
1290     IMPLEMENT_REFCOUNTABLE();
1291
1292     cl_device_id handle;
1293
1294     String name_;
1295     String version_;
1296     std::string extensions_;
1297     int doubleFPConfig_;
1298     bool hostUnifiedMemory_;
1299     int maxComputeUnits_;
1300     size_t maxWorkGroupSize_;
1301     int type_;
1302     int addressBits_;
1303     int deviceVersionMajor_;
1304     int deviceVersionMinor_;
1305     String driverVersion_;
1306     String vendorName_;
1307     int vendorID_;
1308     bool intelSubgroupsSupport_;
1309
1310     std::set<std::string> extensions_set_;
1311 };
1312
1313
1314 Device::Device()
1315 {
1316     p = 0;
1317 }
1318
1319 Device::Device(void* d)
1320 {
1321     p = 0;
1322     set(d);
1323 }
1324
1325 Device::Device(const Device& d)
1326 {
1327     p = d.p;
1328     if(p)
1329         p->addref();
1330 }
1331
1332 Device& Device::operator = (const Device& d)
1333 {
1334     Impl* newp = (Impl*)d.p;
1335     if(newp)
1336         newp->addref();
1337     if(p)
1338         p->release();
1339     p = newp;
1340     return *this;
1341 }
1342
1343 Device::~Device()
1344 {
1345     if(p)
1346         p->release();
1347 }
1348
1349 void Device::set(void* d)
1350 {
1351     if(p)
1352         p->release();
1353     p = new Impl(d);
1354 }
1355
1356 void* Device::ptr() const
1357 {
1358     return p ? p->handle : 0;
1359 }
1360
1361 String Device::name() const
1362 { return p ? p->name_ : String(); }
1363
1364 String Device::extensions() const
1365 { return p ? String(p->extensions_) : String(); }
1366
1367 bool Device::isExtensionSupported(const String& extensionName) const
1368 { return p ? p->isExtensionSupported(extensionName) : false; }
1369
1370 String Device::version() const
1371 { return p ? p->version_ : String(); }
1372
1373 String Device::vendorName() const
1374 { return p ? p->vendorName_ : String(); }
1375
1376 int Device::vendorID() const
1377 { return p ? p->vendorID_ : 0; }
1378
1379 String Device::OpenCL_C_Version() const
1380 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1381
1382 String Device::OpenCLVersion() const
1383 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1384
1385 int Device::deviceVersionMajor() const
1386 { return p ? p->deviceVersionMajor_ : 0; }
1387
1388 int Device::deviceVersionMinor() const
1389 { return p ? p->deviceVersionMinor_ : 0; }
1390
1391 String Device::driverVersion() const
1392 { return p ? p->driverVersion_ : String(); }
1393
1394 int Device::type() const
1395 { return p ? p->type_ : 0; }
1396
1397 int Device::addressBits() const
1398 { return p ? p->addressBits_ : 0; }
1399
1400 bool Device::available() const
1401 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1402
1403 bool Device::compilerAvailable() const
1404 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1405
1406 bool Device::linkerAvailable() const
1407 #ifdef CL_VERSION_1_2
1408 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1409 #else
1410 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1411 #endif
1412
1413 int Device::doubleFPConfig() const
1414 { return p ? p->doubleFPConfig_ : 0; }
1415
1416 int Device::singleFPConfig() const
1417 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1418
1419 int Device::halfFPConfig() const
1420 #ifdef CL_VERSION_1_2
1421 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1422 #else
1423 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1424 #endif
1425
1426 bool Device::endianLittle() const
1427 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1428
1429 bool Device::errorCorrectionSupport() const
1430 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1431
1432 int Device::executionCapabilities() const
1433 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1434
1435 size_t Device::globalMemCacheSize() const
1436 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1437
1438 int Device::globalMemCacheType() const
1439 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1440
1441 int Device::globalMemCacheLineSize() const
1442 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1443
1444 size_t Device::globalMemSize() const
1445 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1446
1447 size_t Device::localMemSize() const
1448 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1449
1450 int Device::localMemType() const
1451 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1452
1453 bool Device::hostUnifiedMemory() const
1454 { return p ? p->hostUnifiedMemory_ : false; }
1455
1456 bool Device::imageSupport() const
1457 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1458
1459 bool Device::imageFromBufferSupport() const
1460 {
1461     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1462 }
1463
1464 uint Device::imagePitchAlignment() const
1465 {
1466 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1467     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1468 #else
1469     return 0;
1470 #endif
1471 }
1472
1473 uint Device::imageBaseAddressAlignment() const
1474 {
1475 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1476     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1477 #else
1478     return 0;
1479 #endif
1480 }
1481
1482 size_t Device::image2DMaxWidth() const
1483 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1484
1485 size_t Device::image2DMaxHeight() const
1486 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1487
1488 size_t Device::image3DMaxWidth() const
1489 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1490
1491 size_t Device::image3DMaxHeight() const
1492 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1493
1494 size_t Device::image3DMaxDepth() const
1495 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1496
1497 size_t Device::imageMaxBufferSize() const
1498 #ifdef CL_VERSION_1_2
1499 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1500 #else
1501 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1502 #endif
1503
1504 size_t Device::imageMaxArraySize() const
1505 #ifdef CL_VERSION_1_2
1506 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1507 #else
1508 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1509 #endif
1510
1511 bool Device::intelSubgroupsSupport() const
1512 { return p ? p->intelSubgroupsSupport_ : false; }
1513
1514 int Device::maxClockFrequency() const
1515 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1516
1517 int Device::maxComputeUnits() const
1518 { return p ? p->maxComputeUnits_ : 0; }
1519
1520 int Device::maxConstantArgs() const
1521 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1522
1523 size_t Device::maxConstantBufferSize() const
1524 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1525
1526 size_t Device::maxMemAllocSize() const
1527 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1528
1529 size_t Device::maxParameterSize() const
1530 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1531
1532 int Device::maxReadImageArgs() const
1533 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1534
1535 int Device::maxWriteImageArgs() const
1536 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1537
1538 int Device::maxSamplers() const
1539 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1540
1541 size_t Device::maxWorkGroupSize() const
1542 { return p ? p->maxWorkGroupSize_ : 0; }
1543
1544 int Device::maxWorkItemDims() const
1545 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1546
1547 void Device::maxWorkItemSizes(size_t* sizes) const
1548 {
1549     if(p)
1550     {
1551         const int MAX_DIMS = 32;
1552         size_t retsz = 0;
1553         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1554                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1555     }
1556 }
1557
1558 int Device::memBaseAddrAlign() const
1559 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1560
1561 int Device::nativeVectorWidthChar() const
1562 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1563
1564 int Device::nativeVectorWidthShort() const
1565 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1566
1567 int Device::nativeVectorWidthInt() const
1568 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1569
1570 int Device::nativeVectorWidthLong() const
1571 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1572
1573 int Device::nativeVectorWidthFloat() const
1574 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1575
1576 int Device::nativeVectorWidthDouble() const
1577 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1578
1579 int Device::nativeVectorWidthHalf() const
1580 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1581
1582 int Device::preferredVectorWidthChar() const
1583 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1584
1585 int Device::preferredVectorWidthShort() const
1586 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1587
1588 int Device::preferredVectorWidthInt() const
1589 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1590
1591 int Device::preferredVectorWidthLong() const
1592 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1593
1594 int Device::preferredVectorWidthFloat() const
1595 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1596
1597 int Device::preferredVectorWidthDouble() const
1598 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1599
1600 int Device::preferredVectorWidthHalf() const
1601 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1602
1603 size_t Device::printfBufferSize() const
1604 #ifdef CL_VERSION_1_2
1605 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1606 #else
1607 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1608 #endif
1609
1610
1611 size_t Device::profilingTimerResolution() const
1612 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1613
1614 const Device& Device::getDefault()
1615 {
1616     const Context& ctx = Context::getDefault();
1617     int idx = getCoreTlsData().device;
1618     const Device& device = ctx.device(idx);
1619     return device;
1620 }
1621
1622 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1623
1624 template <typename Functor, typename ObjectType>
1625 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
1626 {
1627     ::size_t required;
1628     cl_int err = f(obj, name, 0, NULL, &required);
1629     if (err != CL_SUCCESS)
1630         return err;
1631
1632     param.clear();
1633     if (required > 0)
1634     {
1635         AutoBuffer<char> buf(required + 1);
1636         char* ptr = buf.data(); // cleanup is not needed
1637         err = f(obj, name, required, ptr, NULL);
1638         if (err != CL_SUCCESS)
1639             return err;
1640         param = ptr;
1641     }
1642
1643     return CL_SUCCESS;
1644 }
1645
1646 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
1647 {
1648     elems.clear();
1649     if (s.size() == 0)
1650         return;
1651     std::istringstream ss(s);
1652     std::string item;
1653     while (!ss.eof())
1654     {
1655         std::getline(ss, item, delim);
1656         elems.push_back(item);
1657     }
1658 }
1659
1660 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
1661 // Sample: AMD:GPU:
1662 // Sample: AMD:GPU:Tahiti
1663 // Sample: :GPU|CPU: = '' = ':' = '::'
1664 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
1665         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
1666 {
1667     std::vector<std::string> parts;
1668     split(configurationStr, ':', parts);
1669     if (parts.size() > 3)
1670     {
1671         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
1672         return false;
1673     }
1674     if (parts.size() > 2)
1675         deviceNameOrID = parts[2];
1676     if (parts.size() > 1)
1677     {
1678         split(parts[1], '|', deviceTypes);
1679     }
1680     if (parts.size() > 0)
1681     {
1682         platform = parts[0];
1683     }
1684     return true;
1685 }
1686
1687 #if defined WINRT || defined _WIN32_WCE
1688 static cl_device_id selectOpenCLDevice()
1689 {
1690     return NULL;
1691 }
1692 #else
1693 // std::tolower is int->int
1694 static char char_tolower(char ch)
1695 {
1696     return (char)std::tolower((int)ch);
1697 }
1698 static cl_device_id selectOpenCLDevice()
1699 {
1700     std::string platform, deviceName;
1701     std::vector<std::string> deviceTypes;
1702
1703     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
1704     if (configuration &&
1705             (strcmp(configuration, "disabled") == 0 ||
1706              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
1707             ))
1708         return NULL;
1709
1710     bool isID = false;
1711     int deviceID = -1;
1712     if (deviceName.length() == 1)
1713     // We limit ID range to 0..9, because we want to write:
1714     // - '2500' to mean i5-2500
1715     // - '8350' to mean AMD FX-8350
1716     // - '650' to mean GeForce 650
1717     // To extend ID range change condition to '> 0'
1718     {
1719         isID = true;
1720         for (size_t i = 0; i < deviceName.length(); i++)
1721         {
1722             if (!isdigit(deviceName[i]))
1723             {
1724                 isID = false;
1725                 break;
1726             }
1727         }
1728         if (isID)
1729         {
1730             deviceID = atoi(deviceName.c_str());
1731             if (deviceID < 0)
1732                 return NULL;
1733         }
1734     }
1735
1736     std::vector<cl_platform_id> platforms;
1737     {
1738         cl_uint numPlatforms = 0;
1739         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
1740
1741         if (numPlatforms == 0)
1742             return NULL;
1743         platforms.resize((size_t)numPlatforms);
1744         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
1745         platforms.resize(numPlatforms);
1746     }
1747
1748     int selectedPlatform = -1;
1749     if (platform.length() > 0)
1750     {
1751         for (size_t i = 0; i < platforms.size(); i++)
1752         {
1753             std::string name;
1754             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
1755             if (name.find(platform) != std::string::npos)
1756             {
1757                 selectedPlatform = (int)i;
1758                 break;
1759             }
1760         }
1761         if (selectedPlatform == -1)
1762         {
1763             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
1764             goto not_found;
1765         }
1766     }
1767     if (deviceTypes.size() == 0)
1768     {
1769         if (!isID)
1770         {
1771             deviceTypes.push_back("GPU");
1772             if (configuration)
1773                 deviceTypes.push_back("CPU");
1774         }
1775         else
1776             deviceTypes.push_back("ALL");
1777     }
1778     for (size_t t = 0; t < deviceTypes.size(); t++)
1779     {
1780         int deviceType = 0;
1781         std::string tempStrDeviceType = deviceTypes[t];
1782         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
1783
1784         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1785             deviceType = Device::TYPE_GPU;
1786         else if (tempStrDeviceType == "cpu")
1787             deviceType = Device::TYPE_CPU;
1788         else if (tempStrDeviceType == "accelerator")
1789             deviceType = Device::TYPE_ACCELERATOR;
1790         else if (tempStrDeviceType == "all")
1791             deviceType = Device::TYPE_ALL;
1792         else
1793         {
1794             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
1795             goto not_found;
1796         }
1797
1798         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
1799         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
1800                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
1801                 i++)
1802         {
1803             cl_uint count = 0;
1804             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
1805             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1806             {
1807                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
1808             }
1809             if (count == 0)
1810                 continue;
1811             size_t base = devices.size();
1812             devices.resize(base + count);
1813             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
1814             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1815             {
1816                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
1817             }
1818         }
1819
1820         for (size_t i = (isID ? deviceID : 0);
1821              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
1822              i++)
1823         {
1824             std::string name;
1825             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
1826             cl_bool useGPU = true;
1827             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1828             {
1829                 cl_bool isIGPU = CL_FALSE;
1830                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
1831                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
1832             }
1833             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
1834             {
1835                 // TODO check for OpenCL 1.1
1836                 return devices[i];
1837             }
1838         }
1839     }
1840
1841 not_found:
1842     if (!configuration)
1843         return NULL; // suppress messages on stderr
1844
1845     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
1846             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
1847             << "    Device types: ";
1848     for (size_t t = 0; t < deviceTypes.size(); t++)
1849         std::cerr << deviceTypes[t] << " ";
1850
1851     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
1852     return NULL;
1853 }
1854 #endif
1855
1856 #ifdef HAVE_OPENCL_SVM
1857 namespace svm {
1858
1859 enum AllocatorFlags { // don't use first 16 bits
1860         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
1861         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
1862         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
1863         OPENCL_SVM_BUFFER_MASK = 3 << 16,
1864         OPENCL_SVM_BUFFER_MAP = 4 << 16
1865 };
1866
1867 static bool checkForceSVMUmatUsage()
1868 {
1869     static bool initialized = false;
1870     static bool force = false;
1871     if (!initialized)
1872     {
1873         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
1874         initialized = true;
1875     }
1876     return force;
1877 }
1878 static bool checkDisableSVMUMatUsage()
1879 {
1880     static bool initialized = false;
1881     static bool force = false;
1882     if (!initialized)
1883     {
1884         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
1885         initialized = true;
1886     }
1887     return force;
1888 }
1889 static bool checkDisableSVM()
1890 {
1891     static bool initialized = false;
1892     static bool force = false;
1893     if (!initialized)
1894     {
1895         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
1896         initialized = true;
1897     }
1898     return force;
1899 }
1900 // see SVMCapabilities
1901 static unsigned int getSVMCapabilitiesMask()
1902 {
1903     static bool initialized = false;
1904     static unsigned int mask = 0;
1905     if (!initialized)
1906     {
1907         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
1908         if (envValue == NULL)
1909         {
1910             return ~0U; // all bits 1
1911         }
1912         mask = atoi(envValue);
1913         initialized = true;
1914     }
1915     return mask;
1916 }
1917 } // namespace
1918 #endif
1919
1920 static size_t getProgramCountLimit()
1921 {
1922     static bool initialized = false;
1923     static size_t count = 0;
1924     if (!initialized)
1925     {
1926         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
1927         initialized = true;
1928     }
1929     return count;
1930 }
1931
1932 struct Context::Impl
1933 {
1934     static Context::Impl* get(Context& context) { return context.p; }
1935
1936     void __init()
1937     {
1938         refcount = 1;
1939         handle = 0;
1940 #ifdef HAVE_OPENCL_SVM
1941         svmInitialized = false;
1942 #endif
1943     }
1944
1945     Impl()
1946     {
1947         __init();
1948     }
1949
1950     void setDefault()
1951     {
1952         CV_Assert(handle == NULL);
1953
1954         cl_device_id d = selectOpenCLDevice();
1955
1956         if (d == NULL)
1957             return;
1958
1959         cl_platform_id pl = NULL;
1960         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
1961
1962         cl_context_properties prop[] =
1963         {
1964             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1965             0
1966         };
1967
1968         // !!! in the current implementation force the number of devices to 1 !!!
1969         cl_uint nd = 1;
1970         cl_int status;
1971
1972         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
1973         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
1974
1975         bool ok = handle != 0 && status == CL_SUCCESS;
1976         if( ok )
1977         {
1978             devices.resize(nd);
1979             devices[0].set(d);
1980         }
1981         else
1982             handle = NULL;
1983     }
1984
1985     Impl(int dtype0)
1986     {
1987         __init();
1988
1989         cl_int retval = 0;
1990         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
1991         cl_context_properties prop[] =
1992         {
1993             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1994             0
1995         };
1996
1997         cl_uint nd0 = 0;
1998         int dtype = dtype0 & 15;
1999         cl_int status = clGetDeviceIDs(pl, dtype, 0, NULL, &nd0);
2000         if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
2001         {
2002             CV_OCL_DBG_CHECK_RESULT(status,
2003                 cv::format("clGetDeviceIDs(platform=%p, device_type=%d, num_entries=0, devices=NULL, numDevices=%p)", pl, dtype, &nd0).c_str());
2004         }
2005
2006         if (nd0 == 0)
2007             return;
2008
2009         AutoBuffer<void*> dlistbuf(nd0*2+1);
2010         cl_device_id* dlist = (cl_device_id*)dlistbuf.data();
2011         cl_device_id* dlist_new = dlist + nd0;
2012         CV_OCL_DBG_CHECK(clGetDeviceIDs(pl, dtype, nd0, dlist, &nd0));
2013
2014         cl_uint i, nd = 0;
2015         String name0;
2016         for(i = 0; i < nd0; i++)
2017         {
2018             Device d(dlist[i]);
2019             if( !d.available() || !d.compilerAvailable() )
2020                 continue;
2021             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2022                 continue;
2023             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2024                 continue;
2025             String name = d.name();
2026             if( nd != 0 && name != name0 )
2027                 continue;
2028             name0 = name;
2029             dlist_new[nd++] = dlist[i];
2030         }
2031
2032         if(nd == 0)
2033             return;
2034
2035         // !!! in the current implementation force the number of devices to 1 !!!
2036         nd = 1;
2037
2038         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2039         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateContext");
2040         bool ok = handle != 0 && retval == CL_SUCCESS;
2041         if( ok )
2042         {
2043             devices.resize(nd);
2044             for( i = 0; i < nd; i++ )
2045                 devices[i].set(dlist_new[i]);
2046         }
2047     }
2048
2049     ~Impl()
2050     {
2051         if(handle)
2052         {
2053             CV_OCL_DBG_CHECK(clReleaseContext(handle));
2054             handle = NULL;
2055         }
2056         devices.clear();
2057     }
2058
2059     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2060
2061     void unloadProg(Program& prog)
2062     {
2063         cv::AutoLock lock(program_cache_mutex);
2064         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2065         {
2066               phash_t::iterator it = phash.find(*i);
2067               if (it != phash.end())
2068               {
2069                   if (it->second.ptr() == prog.ptr())
2070                   {
2071                       phash.erase(*i);
2072                       cacheList.erase(i);
2073                       return;
2074                   }
2075               }
2076         }
2077     }
2078
2079     std::string& getPrefixString()
2080     {
2081         if (prefix.empty())
2082         {
2083             cv::AutoLock lock(program_cache_mutex);
2084             if (prefix.empty())
2085             {
2086                 CV_Assert(!devices.empty());
2087                 const Device& d = devices[0];
2088                 int bits = d.addressBits();
2089                 if (bits > 0 && bits != 64)
2090                     prefix = cv::format("%d-bit--", bits);
2091                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2092                 // sanitize chars
2093                 for (size_t i = 0; i < prefix.size(); i++)
2094                 {
2095                     char c = prefix[i];
2096                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2097                     {
2098                         prefix[i] = '_';
2099                     }
2100                 }
2101             }
2102         }
2103         return prefix;
2104     }
2105
2106     std::string& getPrefixBase()
2107     {
2108         if (prefix_base.empty())
2109         {
2110             cv::AutoLock lock(program_cache_mutex);
2111             if (prefix_base.empty())
2112             {
2113                 const Device& d = devices[0];
2114                 int bits = d.addressBits();
2115                 if (bits > 0 && bits != 64)
2116                     prefix_base = cv::format("%d-bit--", bits);
2117                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2118                 // sanitize chars
2119                 for (size_t i = 0; i < prefix_base.size(); i++)
2120                 {
2121                     char c = prefix_base[i];
2122                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2123                     {
2124                         prefix_base[i] = '_';
2125                     }
2126                 }
2127             }
2128         }
2129         return prefix_base;
2130     }
2131
2132     IMPLEMENT_REFCOUNTABLE();
2133
2134     cl_context handle;
2135     std::vector<Device> devices;
2136
2137     std::string prefix;
2138     std::string prefix_base;
2139
2140     cv::Mutex program_cache_mutex;
2141     typedef std::map<std::string, Program> phash_t;
2142     phash_t phash;
2143     typedef std::list<cv::String> CacheList;
2144     CacheList cacheList;
2145
2146 #ifdef HAVE_OPENCL_SVM
2147     bool svmInitialized;
2148     bool svmAvailable;
2149     bool svmEnabled;
2150     svm::SVMCapabilities svmCapabilities;
2151     svm::SVMFunctions svmFunctions;
2152
2153     void svmInit()
2154     {
2155         CV_Assert(handle != NULL);
2156         const Device& device = devices[0];
2157         cl_device_svm_capabilities deviceCaps = 0;
2158         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2159         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2160         if (status != CL_SUCCESS)
2161         {
2162             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2163             goto noSVM;
2164         }
2165         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2166         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2167         svmCapabilities.value_ =
2168                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2169                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2170                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2171                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2172         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2173         if (svmCapabilities.value_ == 0)
2174         {
2175             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2176             goto noSVM;
2177         }
2178         try
2179         {
2180             // Try OpenCL 2.0
2181             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2182             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2183             if (!ptr)
2184             {
2185                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2186                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2187             }
2188             try
2189             {
2190                 bool error = false;
2191                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2192                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2193                 {
2194                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2195                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2196                 }
2197                 clFinish(q);
2198                 try
2199                 {
2200                     ((int*)ptr)[0] = 100;
2201                 }
2202                 catch (...)
2203                 {
2204                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2205                     error = true;
2206                 }
2207                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2208                 {
2209                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2210                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2211                 }
2212                 clFinish(q);
2213                 if (error)
2214                 {
2215                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2216                 }
2217             }
2218             catch (...)
2219             {
2220                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2221                 clSVMFree(handle, ptr);
2222                 throw;
2223             }
2224             clSVMFree(handle, ptr);
2225             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2226             svmFunctions.fn_clSVMFree = clSVMFree;
2227             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2228             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2229             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2230             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2231             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2232             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2233             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2234         }
2235         catch (...)
2236         {
2237             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2238             try
2239             {
2240                 // Try HSA extension
2241                 String extensions = device.extensions();
2242                 if (extensions.find("cl_amd_svm") == String::npos)
2243                 {
2244                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2245                     goto noSVM;
2246                 }
2247                 cl_platform_id p = NULL;
2248                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2249                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2250                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2251                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2252                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2253                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2254                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2255                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2256                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2257                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2258                 CV_Assert(svmFunctions.isValid());
2259             }
2260             catch (...)
2261             {
2262                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2263                 goto noSVM;
2264             }
2265         }
2266
2267         svmAvailable = true;
2268         svmEnabled = !svm::checkDisableSVM();
2269         svmInitialized = true;
2270         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2271         return;
2272     noSVM:
2273         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2274         svmAvailable = false;
2275         svmEnabled = false;
2276         svmCapabilities.value_ = 0;
2277         svmInitialized = true;
2278         svmFunctions.fn_clSVMAlloc = NULL;
2279         return;
2280     }
2281 #endif
2282
2283     friend class Program;
2284 };
2285
2286
2287 Context::Context()
2288 {
2289     p = 0;
2290 }
2291
2292 Context::Context(int dtype)
2293 {
2294     p = 0;
2295     create(dtype);
2296 }
2297
2298 bool Context::create()
2299 {
2300     if( !haveOpenCL() )
2301         return false;
2302     if(p)
2303         p->release();
2304     p = new Impl();
2305     if(!p->handle)
2306     {
2307         delete p;
2308         p = 0;
2309     }
2310     return p != 0;
2311 }
2312
2313 bool Context::create(int dtype0)
2314 {
2315     if( !haveOpenCL() )
2316         return false;
2317     if(p)
2318         p->release();
2319     p = new Impl(dtype0);
2320     if(!p->handle)
2321     {
2322         delete p;
2323         p = 0;
2324     }
2325     return p != 0;
2326 }
2327
2328 Context::~Context()
2329 {
2330     if (p)
2331     {
2332         p->release();
2333         p = NULL;
2334     }
2335 }
2336
2337 Context::Context(const Context& c)
2338 {
2339     p = (Impl*)c.p;
2340     if(p)
2341         p->addref();
2342 }
2343
2344 Context& Context::operator = (const Context& c)
2345 {
2346     Impl* newp = (Impl*)c.p;
2347     if(newp)
2348         newp->addref();
2349     if(p)
2350         p->release();
2351     p = newp;
2352     return *this;
2353 }
2354
2355 void* Context::ptr() const
2356 {
2357     return p == NULL ? NULL : p->handle;
2358 }
2359
2360 size_t Context::ndevices() const
2361 {
2362     return p ? p->devices.size() : 0;
2363 }
2364
2365 const Device& Context::device(size_t idx) const
2366 {
2367     static Device dummy;
2368     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2369 }
2370
2371 Context& Context::getDefault(bool initialize)
2372 {
2373     static Context* ctx = new Context();
2374     if(!ctx->p && haveOpenCL())
2375     {
2376         if (!ctx->p)
2377             ctx->p = new Impl();
2378         if (initialize)
2379         {
2380             // do not create new Context right away.
2381             // First, try to retrieve existing context of the same type.
2382             // In its turn, Platform::getContext() may call Context::create()
2383             // if there is no such context.
2384             if (ctx->p->handle == NULL)
2385                 ctx->p->setDefault();
2386         }
2387     }
2388
2389     return *ctx;
2390 }
2391
2392 Program Context::getProg(const ProgramSource& prog,
2393                          const String& buildopts, String& errmsg)
2394 {
2395     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2396 }
2397
2398 void Context::unloadProg(Program& prog)
2399 {
2400     if (p)
2401         p->unloadProg(prog);
2402 }
2403
2404 #ifdef HAVE_OPENCL_SVM
2405 bool Context::useSVM() const
2406 {
2407     Context::Impl* i = p;
2408     CV_Assert(i);
2409     if (!i->svmInitialized)
2410         i->svmInit();
2411     return i->svmEnabled;
2412 }
2413 void Context::setUseSVM(bool enabled)
2414 {
2415     Context::Impl* i = p;
2416     CV_Assert(i);
2417     if (!i->svmInitialized)
2418         i->svmInit();
2419     if (enabled && !i->svmAvailable)
2420     {
2421         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2422     }
2423     i->svmEnabled = enabled;
2424 }
2425 #else
2426 bool Context::useSVM() const { return false; }
2427 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2428 #endif
2429
2430 #ifdef HAVE_OPENCL_SVM
2431 namespace svm {
2432
2433 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2434 {
2435     Context::Impl* i = context.p;
2436     CV_Assert(i);
2437     if (!i->svmInitialized)
2438         i->svmInit();
2439     return i->svmCapabilities;
2440 }
2441
2442 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2443 {
2444     Context::Impl* i = context.p;
2445     CV_Assert(i);
2446     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2447     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2448     return &i->svmFunctions;
2449 }
2450
2451 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2452 {
2453     if (checkForceSVMUmatUsage())
2454         return true;
2455     if (checkDisableSVMUMatUsage())
2456         return false;
2457     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2458         return true;
2459     return false; // don't use SVM by default
2460 }
2461
2462 } // namespace cv::ocl::svm
2463 #endif // HAVE_OPENCL_SVM
2464
2465
2466 static void get_platform_name(cl_platform_id id, String& name)
2467 {
2468     // get platform name string length
2469     size_t sz = 0;
2470     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
2471
2472     // get platform name string
2473     AutoBuffer<char> buf(sz + 1);
2474     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
2475
2476     // just in case, ensure trailing zero for ASCIIZ string
2477     buf[sz] = 0;
2478
2479     name = buf.data();
2480 }
2481
2482 /*
2483 // Attaches OpenCL context to OpenCV
2484 */
2485 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
2486 {
2487     cl_uint cnt = 0;
2488
2489     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
2490
2491     if (cnt == 0)
2492         CV_Error(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
2493
2494     std::vector<cl_platform_id> platforms(cnt);
2495
2496     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
2497
2498     bool platformAvailable = false;
2499
2500     // check if external platformName contained in list of available platforms in OpenCV
2501     for (unsigned int i = 0; i < cnt; i++)
2502     {
2503         String availablePlatformName;
2504         get_platform_name(platforms[i], availablePlatformName);
2505         // external platform is found in the list of available platforms
2506         if (platformName == availablePlatformName)
2507         {
2508             platformAvailable = true;
2509             break;
2510         }
2511     }
2512
2513     if (!platformAvailable)
2514         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2515
2516     // check if platformID corresponds to platformName
2517     String actualPlatformName;
2518     get_platform_name((cl_platform_id)platformID, actualPlatformName);
2519     if (platformName != actualPlatformName)
2520         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2521
2522     // do not initialize OpenCL context
2523     Context ctx = Context::getDefault(false);
2524
2525     // attach supplied context to OpenCV
2526     initializeContextFromHandle(ctx, platformID, context, deviceID);
2527
2528     CV_OCL_CHECK(clRetainContext((cl_context)context));
2529
2530     // clear command queue, if any
2531     CoreTLSData& data = getCoreTlsData();
2532     data.oclQueue.finish();
2533     Queue q;
2534     data.oclQueue = q;
2535
2536     return;
2537 } // attachContext()
2538
2539
2540 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2541 {
2542     cl_context context = (cl_context)_context;
2543     cl_device_id device = (cl_device_id)_device;
2544
2545     // cleanup old context
2546     Context::Impl * impl = ctx.p;
2547     if (impl->handle)
2548     {
2549         CV_OCL_DBG_CHECK(clReleaseContext(impl->handle));
2550     }
2551     impl->devices.clear();
2552
2553     impl->handle = context;
2554     impl->devices.resize(1);
2555     impl->devices[0].set(device);
2556
2557     Platform& p = Platform::getDefault();
2558     Platform::Impl* pImpl = p.p;
2559     pImpl->handle = (cl_platform_id)platform;
2560 }
2561
2562 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2563
2564 struct Queue::Impl
2565 {
2566     inline void __init()
2567     {
2568         refcount = 1;
2569         handle = 0;
2570         isProfilingQueue_ = false;
2571     }
2572
2573     Impl(cl_command_queue q)
2574     {
2575         __init();
2576         handle = q;
2577
2578         cl_command_queue_properties props = 0;
2579         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
2580         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
2581     }
2582
2583     Impl(cl_command_queue q, bool isProfilingQueue)
2584     {
2585         __init();
2586         handle = q;
2587         isProfilingQueue_ = isProfilingQueue;
2588     }
2589
2590     Impl(const Context& c, const Device& d, bool withProfiling = false)
2591     {
2592         __init();
2593
2594         const Context* pc = &c;
2595         cl_context ch = (cl_context)pc->ptr();
2596         if( !ch )
2597         {
2598             pc = &Context::getDefault();
2599             ch = (cl_context)pc->ptr();
2600         }
2601         cl_device_id dh = (cl_device_id)d.ptr();
2602         if( !dh )
2603             dh = (cl_device_id)pc->device(0).ptr();
2604         cl_int retval = 0;
2605         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
2606         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
2607         isProfilingQueue_ = withProfiling;
2608     }
2609
2610     ~Impl()
2611     {
2612 #ifdef _WIN32
2613         if (!cv::__termination)
2614 #endif
2615         {
2616             if(handle)
2617             {
2618                 CV_OCL_DBG_CHECK(clFinish(handle));
2619                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
2620                 handle = NULL;
2621             }
2622         }
2623     }
2624
2625     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
2626     {
2627         if (isProfilingQueue_)
2628             return self;
2629
2630         if (profiling_queue_.ptr())
2631             return profiling_queue_;
2632
2633         cl_context ctx = 0;
2634         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
2635
2636         cl_device_id device = 0;
2637         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
2638
2639         cl_int result = CL_SUCCESS;
2640         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
2641         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
2642         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
2643
2644         Queue queue;
2645         queue.p = new Impl(q, true);
2646         profiling_queue_ = queue;
2647
2648         return profiling_queue_;
2649     }
2650
2651     IMPLEMENT_REFCOUNTABLE();
2652
2653     cl_command_queue handle;
2654     bool isProfilingQueue_;
2655     cv::ocl::Queue profiling_queue_;
2656 };
2657
2658 Queue::Queue()
2659 {
2660     p = 0;
2661 }
2662
2663 Queue::Queue(const Context& c, const Device& d)
2664 {
2665     p = 0;
2666     create(c, d);
2667 }
2668
2669 Queue::Queue(const Queue& q)
2670 {
2671     p = q.p;
2672     if(p)
2673         p->addref();
2674 }
2675
2676 Queue& Queue::operator = (const Queue& q)
2677 {
2678     Impl* newp = (Impl*)q.p;
2679     if(newp)
2680         newp->addref();
2681     if(p)
2682         p->release();
2683     p = newp;
2684     return *this;
2685 }
2686
2687 Queue::~Queue()
2688 {
2689     if(p)
2690         p->release();
2691 }
2692
2693 bool Queue::create(const Context& c, const Device& d)
2694 {
2695     if(p)
2696         p->release();
2697     p = new Impl(c, d);
2698     return p->handle != 0;
2699 }
2700
2701 void Queue::finish()
2702 {
2703     if(p && p->handle)
2704     {
2705         CV_OCL_DBG_CHECK(clFinish(p->handle));
2706     }
2707 }
2708
2709 const Queue& Queue::getProfilingQueue() const
2710 {
2711     CV_Assert(p);
2712     return p->getProfilingQueue(*this);
2713 }
2714
2715 void* Queue::ptr() const
2716 {
2717     return p ? p->handle : 0;
2718 }
2719
2720 Queue& Queue::getDefault()
2721 {
2722     Queue& q = getCoreTlsData().oclQueue;
2723     if( !q.p && haveOpenCL() )
2724         q.create(Context::getDefault());
2725     return q;
2726 }
2727
2728 static cl_command_queue getQueue(const Queue& q)
2729 {
2730     cl_command_queue qq = (cl_command_queue)q.ptr();
2731     if(!qq)
2732         qq = (cl_command_queue)Queue::getDefault().ptr();
2733     return qq;
2734 }
2735
2736 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2737
2738 KernelArg::KernelArg()
2739     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2740 {
2741 }
2742
2743 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2744     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2745 {
2746     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
2747 }
2748
2749 KernelArg KernelArg::Constant(const Mat& m)
2750 {
2751     CV_Assert(m.isContinuous());
2752     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2753 }
2754
2755 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2756
2757 struct Kernel::Impl
2758 {
2759     Impl(const char* kname, const Program& prog) :
2760         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
2761     {
2762         cl_program ph = (cl_program)prog.ptr();
2763         cl_int retval = 0;
2764         name = kname;
2765         if (ph)
2766         {
2767             handle = clCreateKernel(ph, kname, &retval);
2768             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
2769         }
2770         for( int i = 0; i < MAX_ARRS; i++ )
2771             u[i] = 0;
2772         haveTempDstUMats = false;
2773         haveTempSrcUMats = false;
2774     }
2775
2776     void cleanupUMats()
2777     {
2778         for( int i = 0; i < MAX_ARRS; i++ )
2779             if( u[i] )
2780             {
2781                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2782                 {
2783                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
2784                     u[i]->currAllocator->deallocate(u[i]);
2785                 }
2786                 u[i] = 0;
2787             }
2788         nu = 0;
2789         haveTempDstUMats = false;
2790         haveTempSrcUMats = false;
2791     }
2792
2793     void addUMat(const UMat& m, bool dst)
2794     {
2795         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2796         u[nu] = m.u;
2797         CV_XADD(&m.u->urefcount, 1);
2798         nu++;
2799         if(dst && m.u->tempUMat())
2800             haveTempDstUMats = true;
2801         if(m.u->originalUMatData == NULL && m.u->tempUMat())
2802             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
2803     }
2804
2805     /// Preserve image lifetime (while it is specified as Kernel argument)
2806     void registerImageArgument(int arg, const Image2D& image)
2807     {
2808         CV_CheckGE(arg, 0, "");
2809         CV_CheckLT(arg, (int)MAX_ARRS, "");
2810         if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr())  // TODO future: replace ptr => impl (more strong check)
2811         {
2812             CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed");
2813         }
2814         shadow_images.reserve(MAX_ARRS);
2815         shadow_images.resize(std::max(shadow_images.size(), (size_t)arg + 1));
2816         shadow_images[arg] = image;
2817     }
2818
2819     void finit(cl_event e)
2820     {
2821         CV_UNUSED(e);
2822         cleanupUMats();
2823         isInProgress = false;
2824         release();
2825     }
2826
2827     bool run(int dims, size_t _globalsize[], size_t _localsize[],
2828             bool sync, int64* timeNS, const Queue& q);
2829
2830     ~Impl()
2831     {
2832         if(handle)
2833         {
2834             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
2835         }
2836     }
2837
2838     IMPLEMENT_REFCOUNTABLE();
2839
2840     cv::String name;
2841     cl_kernel handle;
2842     enum { MAX_ARRS = 16 };
2843     UMatData* u[MAX_ARRS];
2844     bool isInProgress;
2845     bool isAsyncRun;  // true if kernel was scheduled in async mode
2846     int nu;
2847     std::vector<Image2D> shadow_images;
2848     bool haveTempDstUMats;
2849     bool haveTempSrcUMats;
2850 };
2851
2852 }} // namespace cv::ocl
2853
2854 extern "C" {
2855
2856 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
2857 {
2858     try
2859     {
2860         ((cv::ocl::Kernel::Impl*)p)->finit(e);
2861     }
2862     catch (const cv::Exception& exc)
2863     {
2864         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
2865     }
2866     catch (const std::exception& exc)
2867     {
2868         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
2869     }
2870     catch (...)
2871     {
2872         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
2873     }
2874 }
2875
2876 }
2877
2878 namespace cv { namespace ocl {
2879
2880 Kernel::Kernel()
2881 {
2882     p = 0;
2883 }
2884
2885 Kernel::Kernel(const char* kname, const Program& prog)
2886 {
2887     p = 0;
2888     create(kname, prog);
2889 }
2890
2891 Kernel::Kernel(const char* kname, const ProgramSource& src,
2892                const String& buildopts, String* errmsg)
2893 {
2894     p = 0;
2895     create(kname, src, buildopts, errmsg);
2896 }
2897
2898 Kernel::Kernel(const Kernel& k)
2899 {
2900     p = k.p;
2901     if(p)
2902         p->addref();
2903 }
2904
2905 Kernel& Kernel::operator = (const Kernel& k)
2906 {
2907     Impl* newp = (Impl*)k.p;
2908     if(newp)
2909         newp->addref();
2910     if(p)
2911         p->release();
2912     p = newp;
2913     return *this;
2914 }
2915
2916 Kernel::~Kernel()
2917 {
2918     if(p)
2919         p->release();
2920 }
2921
2922 bool Kernel::create(const char* kname, const Program& prog)
2923 {
2924     if(p)
2925         p->release();
2926     p = new Impl(kname, prog);
2927     if(p->handle == 0)
2928     {
2929         p->release();
2930         p = 0;
2931     }
2932 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
2933     CV_Assert(p);
2934 #endif
2935     return p != 0;
2936 }
2937
2938 bool Kernel::create(const char* kname, const ProgramSource& src,
2939                     const String& buildopts, String* errmsg)
2940 {
2941     if(p)
2942     {
2943         p->release();
2944         p = 0;
2945     }
2946     String tempmsg;
2947     if( !errmsg ) errmsg = &tempmsg;
2948     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2949     return create(kname, prog);
2950 }
2951
2952 void* Kernel::ptr() const
2953 {
2954     return p ? p->handle : 0;
2955 }
2956
2957 bool Kernel::empty() const
2958 {
2959     return ptr() == 0;
2960 }
2961
2962 static cv::String dumpValue(size_t sz, const void* p)
2963 {
2964     if (sz == 4)
2965         return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
2966     if (sz == 8)
2967         return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p);
2968     return cv::format("%p", p);
2969 }
2970
2971 int Kernel::set(int i, const void* value, size_t sz)
2972 {
2973     if (!p || !p->handle)
2974         return -1;
2975     if (i < 0)
2976         return i;
2977     if( i == 0 )
2978         p->cleanupUMats();
2979
2980     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2981     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str());
2982     if (retval != CL_SUCCESS)
2983         return -1;
2984     return i+1;
2985 }
2986
2987 int Kernel::set(int i, const Image2D& image2D)
2988 {
2989     cl_mem h = (cl_mem)image2D.ptr();
2990     int res = set(i, &h, sizeof(h));
2991     if (res >= 0)
2992         p->registerImageArgument(i, image2D);
2993     return res;
2994 }
2995
2996 int Kernel::set(int i, const UMat& m)
2997 {
2998     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
2999 }
3000
3001 int Kernel::set(int i, const KernelArg& arg)
3002 {
3003     if( !p || !p->handle )
3004         return -1;
3005     if (i < 0)
3006     {
3007         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3008                 p->name.c_str(), (int)i));
3009         return i;
3010     }
3011     if( i == 0 )
3012         p->cleanupUMats();
3013     cl_int status = 0;
3014     if( arg.m )
3015     {
3016         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
3017                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
3018         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3019         if (ptronly && arg.m->empty())
3020         {
3021             cl_mem h_null = (cl_mem)NULL;
3022             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3023             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3024             return i + 1;
3025         }
3026         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3027
3028         if (!h)
3029         {
3030             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3031                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3032             p->release();
3033             p = 0;
3034             return -1;
3035         }
3036
3037 #ifdef HAVE_OPENCL_SVM
3038         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3039         {
3040             const Context& ctx = Context::getDefault();
3041             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3042             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3043             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3044 #if 1 // TODO
3045             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3046 #else
3047             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3048 #endif
3049             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3050         }
3051         else
3052 #endif
3053         {
3054             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3055             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3056         }
3057
3058         if (ptronly)
3059         {
3060             i++;
3061         }
3062         else if( arg.m->dims <= 2 )
3063         {
3064             UMat2D u2d(*arg.m);
3065             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3066             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3067             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3068             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3069             i += 3;
3070
3071             if( !(arg.flags & KernelArg::NO_SIZE) )
3072             {
3073                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3074                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3075                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3076                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3077                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3078                 i += 2;
3079             }
3080         }
3081         else
3082         {
3083             UMat3D u3d(*arg.m);
3084             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3085             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3086             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3087             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3088             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3089             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3090             i += 4;
3091             if( !(arg.flags & KernelArg::NO_SIZE) )
3092             {
3093                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3094                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3095                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3096                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3097                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3098                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3099                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3100                 i += 3;
3101             }
3102         }
3103         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3104         return i;
3105     }
3106     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3107     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3108     return i+1;
3109 }
3110
3111 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3112                  bool sync, const Queue& q)
3113 {
3114     if (!p)
3115         return false;
3116
3117     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3118     size_t total = 1;
3119     CV_Assert(_globalsize != NULL);
3120     for (int i = 0; i < dims; i++)
3121     {
3122         size_t val = _localsize ? _localsize[i] :
3123             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3124         CV_Assert( val > 0 );
3125         total *= _globalsize[i];
3126         if (_globalsize[i] == 1 && !_localsize)
3127             val = 1;
3128         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3129     }
3130     CV_Assert(total > 0);
3131
3132     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3133 }
3134
3135
3136 static bool isRaiseErrorOnReuseAsyncKernel()
3137 {
3138     static bool initialized = false;
3139     static bool value = false;
3140     if (!initialized)
3141     {
3142         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3143         initialized = true;
3144     }
3145     return value;
3146 }
3147
3148 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3149         bool sync, int64* timeNS, const Queue& q)
3150 {
3151     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3152
3153     if (!handle)
3154     {
3155         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3156         return false;
3157     }
3158
3159     if (isAsyncRun)
3160     {
3161         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3162         if (isRaiseErrorOnReuseAsyncKernel())
3163             CV_Assert(0);
3164         return false;  // OpenCV 5.0: raise error
3165     }
3166     isAsyncRun = !sync;
3167
3168     if (isInProgress)
3169     {
3170         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3171         if (isRaiseErrorOnReuseAsyncKernel())
3172             CV_Assert(0);
3173         return false;  // OpenCV 5.0: raise error
3174     }
3175
3176     cl_command_queue qq = getQueue(q);
3177     if (haveTempDstUMats)
3178         sync = true;
3179     if (haveTempSrcUMats)
3180         sync = true;
3181     if (timeNS)
3182         sync = true;
3183     cl_event asyncEvent = 0;
3184     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3185                                            NULL, globalsize, localsize, 0, 0,
3186                                            (sync && !timeNS) ? 0 : &asyncEvent);
3187 #if !CV_OPENCL_SHOW_RUN_KERNELS
3188     if (retval != CL_SUCCESS)
3189 #endif
3190     {
3191         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%dx%dx%d, localsize=%s) sync=%s", name.c_str(), (int)dims,
3192                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3193                         (localsize ? cv::format("%dx%dx%d", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3194                         sync ? "true" : "false"
3195                         );
3196         if (retval != CL_SUCCESS)
3197         {
3198             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3199         }
3200 #if CV_OPENCL_TRACE_CHECK
3201         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3202 #else
3203         printf("%s\n", msg.c_str());
3204         fflush(stdout);
3205 #endif
3206     }
3207     if (sync || retval != CL_SUCCESS)
3208     {
3209         CV_OCL_DBG_CHECK(clFinish(qq));
3210         if (timeNS)
3211         {
3212             if (retval == CL_SUCCESS)
3213             {
3214                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3215                 cl_ulong startTime, stopTime;
3216                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3217                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3218                 *timeNS = (int64)(stopTime - startTime);
3219             }
3220             else
3221             {
3222                 *timeNS = -1;
3223             }
3224         }
3225         cleanupUMats();
3226     }
3227     else
3228     {
3229         addref();
3230         isInProgress = true;
3231         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3232     }
3233     if (asyncEvent)
3234         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3235     return retval == CL_SUCCESS;
3236 }
3237
3238 bool Kernel::runTask(bool sync, const Queue& q)
3239 {
3240     if(!p || !p->handle || p->isInProgress)
3241         return false;
3242
3243     cl_command_queue qq = getQueue(q);
3244     cl_event asyncEvent = 0;
3245     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3246     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3247     if (sync || retval != CL_SUCCESS)
3248     {
3249         CV_OCL_DBG_CHECK(clFinish(qq));
3250         p->cleanupUMats();
3251     }
3252     else
3253     {
3254         p->addref();
3255         p->isInProgress = true;
3256         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3257     }
3258     if (asyncEvent)
3259         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3260     return retval == CL_SUCCESS;
3261 }
3262
3263 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3264 {
3265     CV_Assert(p && p->handle && !p->isInProgress);
3266     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3267     CV_Assert(q.ptr());
3268     q.finish(); // call clFinish() on base queue
3269     Queue profilingQueue = q.getProfilingQueue();
3270     int64 timeNs = -1;
3271     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3272     return res ? timeNs : -1;
3273 }
3274
3275 size_t Kernel::workGroupSize() const
3276 {
3277     if(!p || !p->handle)
3278         return 0;
3279     size_t val = 0, retsz = 0;
3280     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3281     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3282     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3283     return status == CL_SUCCESS ? val : 0;
3284 }
3285
3286 size_t Kernel::preferedWorkGroupSizeMultiple() const
3287 {
3288     if(!p || !p->handle)
3289         return 0;
3290     size_t val = 0, retsz = 0;
3291     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3292     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3293     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3294     return status == CL_SUCCESS ? val : 0;
3295 }
3296
3297 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3298 {
3299     if(!p || !p->handle || !wsz)
3300         return 0;
3301     size_t retsz = 0;
3302     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3303     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3304     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3305     return status == CL_SUCCESS;
3306 }
3307
3308 size_t Kernel::localMemSize() const
3309 {
3310     if(!p || !p->handle)
3311         return 0;
3312     size_t retsz = 0;
3313     cl_ulong val = 0;
3314     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3315     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3316     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3317     return status == CL_SUCCESS ? (size_t)val : 0;
3318 }
3319
3320
3321
3322 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3323
3324 struct ProgramSource::Impl
3325 {
3326     IMPLEMENT_REFCOUNTABLE();
3327
3328     enum KIND {
3329         PROGRAM_SOURCE_CODE = 0,
3330         PROGRAM_BINARIES,
3331         PROGRAM_SPIR,
3332         PROGRAM_SPIRV
3333     } kind_;
3334
3335     Impl(const String& src)
3336     {
3337         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3338         initFromSource(src, cv::String());
3339     }
3340     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3341     {
3342         init(PROGRAM_SOURCE_CODE, module, name);
3343         initFromSource(codeStr, codeHash);
3344     }
3345
3346     /// reset fields
3347     void init(enum KIND kind, const String& module, const String& name)
3348     {
3349         refcount = 1;
3350         kind_ = kind;
3351         module_ = module;
3352         name_ = name;
3353
3354         sourceAddr_ = NULL;
3355         sourceSize_ = 0;
3356         isHashUpdated = false;
3357     }
3358
3359     void initFromSource(const String& codeStr, const String& codeHash)
3360     {
3361         codeStr_ = codeStr;
3362         sourceHash_ = codeHash;
3363         if (sourceHash_.empty())
3364         {
3365             updateHash();
3366         }
3367         else
3368         {
3369             isHashUpdated = true;
3370         }
3371     }
3372
3373     void updateHash(const char* hashStr = NULL)
3374     {
3375         if (hashStr)
3376         {
3377             sourceHash_ = cv::String(hashStr);
3378             isHashUpdated = true;
3379             return;
3380         }
3381         uint64 hash = 0;
3382         switch (kind_)
3383         {
3384         case PROGRAM_SOURCE_CODE:
3385             if (sourceAddr_)
3386             {
3387                 CV_Assert(codeStr_.empty());
3388                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3389             }
3390             else
3391             {
3392                 CV_Assert(!codeStr_.empty());
3393                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3394             }
3395             break;
3396         case PROGRAM_BINARIES:
3397         case PROGRAM_SPIR:
3398         case PROGRAM_SPIRV:
3399             hash = crc64(sourceAddr_, sourceSize_);
3400             break;
3401         default:
3402             CV_Error(Error::StsInternal, "Internal error");
3403         }
3404         sourceHash_ = cv::format("%08llx", hash);
3405         isHashUpdated = true;
3406     }
3407
3408     Impl(enum KIND kind,
3409             const String& module, const String& name,
3410             const unsigned char* binary, const size_t size,
3411             const cv::String& buildOptions = cv::String())
3412     {
3413         init(kind, module, name);
3414
3415         sourceAddr_ = binary;
3416         sourceSize_ = size;
3417
3418         buildOptions_ = buildOptions;
3419     }
3420
3421     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3422             const char* sourceCodeStaticStr, const char* hashStaticStr,
3423             const cv::String& buildOptions)
3424     {
3425         ProgramSource result;
3426         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3427                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3428         result.p->updateHash(hashStaticStr);
3429         return result;
3430     }
3431
3432     static ProgramSource fromBinary(const String& module, const String& name,
3433             const unsigned char* binary, const size_t size,
3434             const cv::String& buildOptions)
3435     {
3436         ProgramSource result;
3437         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3438         return result;
3439     }
3440
3441     static ProgramSource fromSPIR(const String& module, const String& name,
3442             const unsigned char* binary, const size_t size,
3443             const cv::String& buildOptions)
3444     {
3445         ProgramSource result;
3446         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3447         return result;
3448     }
3449
3450     String module_;
3451     String name_;
3452
3453     // TODO std::vector<ProgramSource> includes_;
3454     String codeStr_; // PROGRAM_SOURCE_CODE only
3455
3456     const unsigned char* sourceAddr_;
3457     size_t sourceSize_;
3458
3459     cv::String buildOptions_;
3460
3461     String sourceHash_;
3462     bool isHashUpdated;
3463
3464     friend struct Program::Impl;
3465     friend struct internal::ProgramEntry;
3466     friend struct Context::Impl;
3467 };
3468
3469
3470 ProgramSource::ProgramSource()
3471 {
3472     p = 0;
3473 }
3474
3475 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3476 {
3477     p = new Impl(module, name, codeStr, codeHash);
3478 }
3479
3480 ProgramSource::ProgramSource(const char* prog)
3481 {
3482     p = new Impl(prog);
3483 }
3484
3485 ProgramSource::ProgramSource(const String& prog)
3486 {
3487     p = new Impl(prog);
3488 }
3489
3490 ProgramSource::~ProgramSource()
3491 {
3492     if(p)
3493         p->release();
3494 }
3495
3496 ProgramSource::ProgramSource(const ProgramSource& prog)
3497 {
3498     p = prog.p;
3499     if(p)
3500         p->addref();
3501 }
3502
3503 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3504 {
3505     Impl* newp = (Impl*)prog.p;
3506     if(newp)
3507         newp->addref();
3508     if(p)
3509         p->release();
3510     p = newp;
3511     return *this;
3512 }
3513
3514 const String& ProgramSource::source() const
3515 {
3516     CV_Assert(p);
3517     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
3518     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
3519     return p->codeStr_;
3520 }
3521
3522 ProgramSource::hash_t ProgramSource::hash() const
3523 {
3524     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
3525 }
3526
3527 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
3528         const unsigned char* binary, const size_t size,
3529         const cv::String& buildOptions)
3530 {
3531     CV_Assert(binary);
3532     CV_Assert(size > 0);
3533     return Impl::fromBinary(module, name, binary, size, buildOptions);
3534 }
3535
3536 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
3537         const unsigned char* binary, const size_t size,
3538         const cv::String& buildOptions)
3539 {
3540     CV_Assert(binary);
3541     CV_Assert(size > 0);
3542     return Impl::fromBinary(module, name, binary, size, buildOptions);
3543 }
3544
3545
3546 internal::ProgramEntry::operator ProgramSource&() const
3547 {
3548     if (this->pProgramSource == NULL)
3549     {
3550         cv::AutoLock lock(cv::getInitializationMutex());
3551         if (this->pProgramSource == NULL)
3552         {
3553             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
3554             ProgramSource* ptr = new ProgramSource(ps);
3555             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
3556         }
3557     }
3558     return *this->pProgramSource;
3559 }
3560
3561
3562
3563 /////////////////////////////////////////// Program /////////////////////////////////////////////
3564
3565 static
3566 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
3567 {
3568     if (b.empty())
3569         return a;
3570     if (a.empty())
3571         return b;
3572     if (b[0] == ' ')
3573         return a + b;
3574     return a + (cv::String(" ") + b);
3575 }
3576
3577 struct Program::Impl
3578 {
3579     IMPLEMENT_REFCOUNTABLE();
3580
3581     Impl(const ProgramSource& src,
3582          const String& _buildflags, String& errmsg) :
3583          refcount(1),
3584          handle(NULL),
3585          buildflags(_buildflags)
3586     {
3587         const ProgramSource::Impl* src_ = src.getImpl();
3588         CV_Assert(src_);
3589         sourceModule_ = src_->module_;
3590         sourceName_ = src_->name_;
3591         const Context ctx = Context::getDefault();
3592         Device device = ctx.device(0);
3593         if (ctx.ptr() == NULL || device.ptr() == NULL)
3594             return;
3595         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
3596         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3597         {
3598             if (device.isAMD())
3599                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
3600             else if (device.isIntel())
3601                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
3602             const String param_buildExtraOptions = getBuildExtraOptions();
3603             if (!param_buildExtraOptions.empty())
3604                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
3605         }
3606         compile(ctx, src_, errmsg);
3607     }
3608
3609     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3610     {
3611         CV_Assert(ctx.getImpl());
3612         CV_Assert(src_);
3613
3614         // We don't cache OpenCL binaries
3615         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
3616         {
3617             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3618             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3619             return isLoaded;
3620         }
3621         return compileWithCache(ctx, src_, errmsg);
3622     }
3623
3624     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3625     {
3626         CV_Assert(ctx.getImpl());
3627         CV_Assert(src_);
3628         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
3629
3630 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3631         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
3632         const std::string base_dir = config.prepareCacheDirectoryForContext(
3633                 ctx.getImpl()->getPrefixString(),
3634                 ctx.getImpl()->getPrefixBase()
3635         );
3636         const String& hash_str = src_->sourceHash_;
3637         cv::String fname;
3638         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
3639         {
3640             CV_Assert(!hash_str.empty());
3641             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
3642             fname = utils::fs::join(base_dir, fname);
3643         }
3644         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
3645         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
3646         {
3647             try
3648             {
3649                 std::vector<char> binaryBuf;
3650                 bool res = false;
3651                 {
3652                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3653                     BinaryProgramFile file(fname, hash_str.c_str());
3654                     res = file.read(buildflags, binaryBuf);
3655                 }
3656                 if (res)
3657                 {
3658                     CV_Assert(!binaryBuf.empty());
3659                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
3660                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
3661                     if (isLoaded)
3662                         return true;
3663                 }
3664             }
3665             catch (const cv::Exception& e)
3666             {
3667                 CV_UNUSED(e);
3668                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
3669             }
3670             catch (...)
3671             {
3672                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
3673             }
3674         }
3675 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3676         CV_Assert(handle == NULL);
3677         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3678         {
3679             if (!buildFromSources(ctx, src_, errmsg))
3680             {
3681                 return false;
3682             }
3683         }
3684         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
3685         {
3686             buildflags = joinBuildOptions(buildflags, " -x spir");
3687             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
3688             {
3689                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
3690             }
3691             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3692             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3693             if (!isLoaded)
3694                 return false;
3695         }
3696         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
3697         {
3698             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
3699         }
3700         else
3701         {
3702             CV_Error(Error::StsInternal, "Internal error");
3703         }
3704         CV_Assert(handle != NULL);
3705 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3706         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
3707         {
3708             try
3709             {
3710                 std::vector<char> binaryBuf;
3711                 getProgramBinary(binaryBuf);
3712                 {
3713                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3714                     BinaryProgramFile file(fname, hash_str.c_str());
3715                     file.write(buildflags, binaryBuf);
3716                 }
3717             }
3718             catch (const cv::Exception& e)
3719             {
3720                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
3721             }
3722             catch (...)
3723             {
3724                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
3725             }
3726         }
3727 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3728 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3729         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3730         {
3731             std::vector<char> binaryBuf;
3732             getProgramBinary(binaryBuf);
3733             if (!binaryBuf.empty())
3734             {
3735                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3736                 handle = NULL;
3737                 createFromBinary(ctx, binaryBuf, errmsg);
3738             }
3739         }
3740 #endif
3741         return handle != NULL;
3742     }
3743
3744     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
3745     {
3746         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
3747
3748         size_t retsz = 0;
3749         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3750                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3751         if (log_retval == CL_SUCCESS && retsz > 1)
3752         {
3753             buffer.resize(retsz + 16);
3754             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3755                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
3756             if (log_retval == CL_SUCCESS)
3757             {
3758                 if (retsz < buffer.size())
3759                     buffer[retsz] = 0;
3760                 else
3761                     buffer[buffer.size() - 1] = 0;
3762             }
3763             else
3764             {
3765                 buffer[0] = 0;
3766             }
3767         }
3768
3769         errmsg = String(buffer.data());
3770         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
3771                 sourceModule_.c_str(), sourceName_.c_str(),
3772                 result, getOpenCLErrorString(result),
3773                 buildflags.c_str(), errmsg.c_str());
3774         fflush(stdout);
3775     }
3776
3777     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3778     {
3779         CV_Assert(src_);
3780         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
3781         CV_Assert(handle == NULL);
3782         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
3783                 sourceModule_.c_str(), sourceName_.c_str(),
3784                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
3785
3786         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
3787
3788         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
3789         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
3790         CV_Assert(srcptr != NULL);
3791         CV_Assert(srclen > 0);
3792
3793         cl_int retval = 0;
3794
3795         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3796         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
3797         CV_Assert(handle || retval != CL_SUCCESS);
3798         if (handle && retval == CL_SUCCESS)
3799         {
3800             size_t n = ctx.ndevices();
3801             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
3802             cl_device_id* deviceList = deviceListBuf.data();
3803             for (size_t i = 0; i < n; i++)
3804             {
3805                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
3806             }
3807
3808             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
3809             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
3810 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3811             if (retval != CL_SUCCESS)
3812 #endif
3813             {
3814                 dumpBuildLog_(retval, deviceList, errmsg);
3815
3816                 // don't remove "retval != CL_SUCCESS" condition here:
3817                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
3818                 if (retval != CL_SUCCESS && handle)
3819                 {
3820                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3821                     handle = NULL;
3822                 }
3823             }
3824 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3825             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3826             {
3827                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
3828                 size_t retsz = 0;
3829                 char kernels_buffer[4096] = {0};
3830                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3831                 if (retsz < sizeof(kernels_buffer))
3832                     kernels_buffer[retsz] = 0;
3833                 else
3834                     kernels_buffer[0] = 0;
3835                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3836             }
3837 #endif
3838
3839         }
3840         return handle != NULL;
3841     }
3842
3843     void getProgramBinary(std::vector<char>& buf)
3844     {
3845         CV_Assert(handle);
3846         size_t sz = 0;
3847         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
3848         buf.resize(sz);
3849         uchar* ptr = (uchar*)&buf[0];
3850         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
3851     }
3852
3853     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
3854     {
3855         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
3856     }
3857
3858     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
3859     {
3860         CV_Assert(handle == NULL);
3861         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
3862         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
3863
3864         CV_Assert(binarySize > 0);
3865
3866         size_t ndevices = (int)ctx.ndevices();
3867         AutoBuffer<cl_device_id> devices_(ndevices);
3868         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
3869         AutoBuffer<size_t> binarySizes_(ndevices);
3870
3871         cl_device_id* devices = devices_.data();
3872         const uchar** binaryPtrs = binaryPtrs_.data();
3873         size_t* binarySizes = binarySizes_.data();
3874         for (size_t i = 0; i < ndevices; i++)
3875         {
3876             devices[i] = (cl_device_id)ctx.device(i).ptr();
3877             binaryPtrs[i] = binaryAddr;
3878             binarySizes[i] = binarySize;
3879         }
3880
3881         cl_int result = 0;
3882         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
3883                                            binarySizes, binaryPtrs, NULL, &result);
3884         if (result != CL_SUCCESS)
3885         {
3886             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
3887             if (handle)
3888             {
3889                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3890                 handle = NULL;
3891             }
3892         }
3893         if (!handle)
3894         {
3895             return false;
3896         }
3897         // call clBuildProgram()
3898         {
3899             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
3900             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
3901             if (result != CL_SUCCESS)
3902             {
3903                 dumpBuildLog_(result, devices, errmsg);
3904                 if (handle)
3905                 {
3906                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3907                     handle = NULL;
3908                 }
3909                 return false;
3910             }
3911         }
3912         // check build status
3913         {
3914             cl_build_status build_status = CL_BUILD_NONE;
3915             size_t retsz = 0;
3916             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
3917                     sizeof(build_status), &build_status, &retsz));
3918             if (result == CL_SUCCESS)
3919             {
3920                 if (build_status == CL_BUILD_SUCCESS)
3921                 {
3922                     return true;
3923                 }
3924                 else
3925                 {
3926                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
3927                     return false;
3928                 }
3929             }
3930             else
3931             {
3932                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
3933                 if (handle)
3934                 {
3935                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3936                     handle = NULL;
3937                 }
3938             }
3939         }
3940 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3941         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3942         {
3943             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
3944             size_t retsz = 0;
3945             char kernels_buffer[4096] = {0};
3946             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3947             if (retsz < sizeof(kernels_buffer))
3948                 kernels_buffer[retsz] = 0;
3949             else
3950                 kernels_buffer[0] = 0;
3951             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3952         }
3953 #endif
3954         return handle != NULL;
3955     }
3956
3957     ~Impl()
3958     {
3959         if( handle )
3960         {
3961 #ifdef _WIN32
3962             if (!cv::__termination)
3963 #endif
3964             {
3965                 clReleaseProgram(handle);
3966             }
3967             handle = NULL;
3968         }
3969     }
3970
3971     cl_program handle;
3972
3973     String buildflags;
3974     String sourceModule_;
3975     String sourceName_;
3976 };
3977
3978
3979 Program::Program() { p = 0; }
3980
3981 Program::Program(const ProgramSource& src,
3982         const String& buildflags, String& errmsg)
3983 {
3984     p = 0;
3985     create(src, buildflags, errmsg);
3986 }
3987
3988 Program::Program(const Program& prog)
3989 {
3990     p = prog.p;
3991     if(p)
3992         p->addref();
3993 }
3994
3995 Program& Program::operator = (const Program& prog)
3996 {
3997     Impl* newp = (Impl*)prog.p;
3998     if(newp)
3999         newp->addref();
4000     if(p)
4001         p->release();
4002     p = newp;
4003     return *this;
4004 }
4005
4006 Program::~Program()
4007 {
4008     if(p)
4009         p->release();
4010 }
4011
4012 bool Program::create(const ProgramSource& src,
4013             const String& buildflags, String& errmsg)
4014 {
4015     if(p)
4016     {
4017         p->release();
4018         p = NULL;
4019     }
4020     p = new Impl(src, buildflags, errmsg);
4021     if(!p->handle)
4022     {
4023         p->release();
4024         p = 0;
4025     }
4026     return p != 0;
4027 }
4028
4029 void* Program::ptr() const
4030 {
4031     return p ? p->handle : 0;
4032 }
4033
4034 #ifndef OPENCV_REMOVE_DEPRECATED_API
4035 const ProgramSource& Program::source() const
4036 {
4037     CV_Error(Error::StsNotImplemented, "Removed API");
4038 }
4039
4040 bool Program::read(const String& bin, const String& buildflags)
4041 {
4042     CV_UNUSED(bin); CV_UNUSED(buildflags);
4043     CV_Error(Error::StsNotImplemented, "Removed API");
4044 }
4045
4046 bool Program::write(String& bin) const
4047 {
4048     CV_UNUSED(bin);
4049     CV_Error(Error::StsNotImplemented, "Removed API");
4050 }
4051
4052 String Program::getPrefix() const
4053 {
4054     if(!p)
4055         return String();
4056     Context::Impl* ctx_ = Context::getDefault().getImpl();
4057     CV_Assert(ctx_);
4058     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4059 }
4060
4061 String Program::getPrefix(const String& buildflags)
4062 {
4063         Context::Impl* ctx_ = Context::getDefault().getImpl();
4064         CV_Assert(ctx_);
4065         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4066 }
4067 #endif // OPENCV_REMOVE_DEPRECATED_API
4068
4069 void Program::getBinary(std::vector<char>& binary) const
4070 {
4071     CV_Assert(p && "Empty program");
4072     p->getProgramBinary(binary);
4073 }
4074
4075 Program Context::Impl::getProg(const ProgramSource& src,
4076                                const String& buildflags, String& errmsg)
4077 {
4078     size_t limit = getProgramCountLimit();
4079     const ProgramSource::Impl* src_ = src.getImpl();
4080     CV_Assert(src_);
4081     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4082             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4083             getPrefixString().c_str(),
4084             buildflags.c_str());
4085     {
4086         cv::AutoLock lock(program_cache_mutex);
4087         phash_t::iterator it = phash.find(key);
4088         if (it != phash.end())
4089         {
4090             // TODO LRU cache
4091             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4092             if (i != cacheList.end() && i != cacheList.begin())
4093             {
4094                 cacheList.erase(i);
4095                 cacheList.push_front(key);
4096             }
4097             return it->second;
4098         }
4099         { // cleanup program cache
4100             size_t sz = phash.size();
4101             if (limit > 0 && sz >= limit)
4102             {
4103                 static bool warningFlag = false;
4104                 if (!warningFlag)
4105                 {
4106                     printf("\nWARNING: OpenCV-OpenCL:\n"
4107                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4108                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4109                     warningFlag = true;
4110                 }
4111                 while (!cacheList.empty())
4112                 {
4113                     size_t c = phash.erase(cacheList.back());
4114                     cacheList.pop_back();
4115                     if (c != 0)
4116                         break;
4117                 }
4118             }
4119         }
4120     }
4121     Program prog(src, buildflags, errmsg);
4122     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4123     {
4124         cv::AutoLock lock(program_cache_mutex);
4125         phash.insert(std::pair<std::string, Program>(key, prog));
4126         cacheList.push_front(key);
4127     }
4128     return prog;
4129 }
4130
4131
4132 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4133
4134 template<typename T>
4135 class OpenCLBufferPool
4136 {
4137 protected:
4138     ~OpenCLBufferPool() { }
4139 public:
4140     virtual T allocate(size_t size) = 0;
4141     virtual void release(T buffer) = 0;
4142 };
4143
4144 template <typename Derived, typename BufferEntry, typename T>
4145 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4146 {
4147 private:
4148     inline Derived& derived() { return *static_cast<Derived*>(this); }
4149 protected:
4150     Mutex mutex_;
4151
4152     size_t currentReservedSize;
4153     size_t maxReservedSize;
4154
4155     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4156     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4157
4158     // synchronized
4159     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4160     {
4161         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4162         for (; i != allocatedEntries_.end(); ++i)
4163         {
4164             BufferEntry& e = *i;
4165             if (e.clBuffer_ == buffer)
4166             {
4167                 entry = e;
4168                 allocatedEntries_.erase(i);
4169                 return true;
4170             }
4171         }
4172         return false;
4173     }
4174
4175     // synchronized
4176     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4177     {
4178         if (reservedEntries_.empty())
4179             return false;
4180         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4181         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4182         BufferEntry result;
4183         size_t minDiff = (size_t)(-1);
4184         for (; i != reservedEntries_.end(); ++i)
4185         {
4186             BufferEntry& e = *i;
4187             if (e.capacity_ >= size)
4188             {
4189                 size_t diff = e.capacity_ - size;
4190                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4191                 {
4192                     minDiff = diff;
4193                     result_pos = i;
4194                     result = e;
4195                     if (diff == 0)
4196                         break;
4197                 }
4198             }
4199         }
4200         if (result_pos != reservedEntries_.end())
4201         {
4202             //CV_DbgAssert(result == *result_pos);
4203             reservedEntries_.erase(result_pos);
4204             entry = result;
4205             currentReservedSize -= entry.capacity_;
4206             allocatedEntries_.push_back(entry);
4207             return true;
4208         }
4209         return false;
4210     }
4211
4212     // synchronized
4213     void _checkSizeOfReservedEntries()
4214     {
4215         while (currentReservedSize > maxReservedSize)
4216         {
4217             CV_DbgAssert(!reservedEntries_.empty());
4218             const BufferEntry& entry = reservedEntries_.back();
4219             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4220             currentReservedSize -= entry.capacity_;
4221             derived()._releaseBufferEntry(entry);
4222             reservedEntries_.pop_back();
4223         }
4224     }
4225
4226     inline size_t _allocationGranularity(size_t size)
4227     {
4228         // heuristic values
4229         if (size < 1024*1024)
4230             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4231         else if (size < 16*1024*1024)
4232             return 64*1024;
4233         else
4234             return 1024*1024;
4235     }
4236
4237 public:
4238     OpenCLBufferPoolBaseImpl()
4239         : currentReservedSize(0),
4240           maxReservedSize(0)
4241     {
4242         // nothing
4243     }
4244     virtual ~OpenCLBufferPoolBaseImpl()
4245     {
4246         freeAllReservedBuffers();
4247         CV_Assert(reservedEntries_.empty());
4248     }
4249 public:
4250     virtual T allocate(size_t size) CV_OVERRIDE
4251     {
4252         AutoLock locker(mutex_);
4253         BufferEntry entry;
4254         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4255         {
4256             CV_DbgAssert(size <= entry.capacity_);
4257             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4258         }
4259         else
4260         {
4261             derived()._allocateBufferEntry(entry, size);
4262         }
4263         return entry.clBuffer_;
4264     }
4265     virtual void release(T buffer) CV_OVERRIDE
4266     {
4267         AutoLock locker(mutex_);
4268         BufferEntry entry;
4269         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4270         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4271         {
4272             derived()._releaseBufferEntry(entry);
4273         }
4274         else
4275         {
4276             reservedEntries_.push_front(entry);
4277             currentReservedSize += entry.capacity_;
4278             _checkSizeOfReservedEntries();
4279         }
4280     }
4281
4282     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4283     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4284     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4285     {
4286         AutoLock locker(mutex_);
4287         size_t oldMaxReservedSize = maxReservedSize;
4288         maxReservedSize = size;
4289         if (maxReservedSize < oldMaxReservedSize)
4290         {
4291             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4292             for (; i != reservedEntries_.end();)
4293             {
4294                 const BufferEntry& entry = *i;
4295                 if (entry.capacity_ > maxReservedSize / 8)
4296                 {
4297                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4298                     currentReservedSize -= entry.capacity_;
4299                     derived()._releaseBufferEntry(entry);
4300                     i = reservedEntries_.erase(i);
4301                     continue;
4302                 }
4303                 ++i;
4304             }
4305             _checkSizeOfReservedEntries();
4306         }
4307     }
4308     virtual void freeAllReservedBuffers() CV_OVERRIDE
4309     {
4310         AutoLock locker(mutex_);
4311         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4312         for (; i != reservedEntries_.end(); ++i)
4313         {
4314             const BufferEntry& entry = *i;
4315             derived()._releaseBufferEntry(entry);
4316         }
4317         reservedEntries_.clear();
4318         currentReservedSize = 0;
4319     }
4320 };
4321
4322 struct CLBufferEntry
4323 {
4324     cl_mem clBuffer_;
4325     size_t capacity_;
4326     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4327 };
4328
4329 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4330 {
4331 public:
4332     typedef struct CLBufferEntry BufferEntry;
4333 protected:
4334     int createFlags_;
4335 public:
4336     OpenCLBufferPoolImpl(int createFlags = 0)
4337         : createFlags_(createFlags)
4338     {
4339     }
4340
4341     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4342     {
4343         CV_DbgAssert(entry.clBuffer_ == NULL);
4344         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4345         Context& ctx = Context::getDefault();
4346         cl_int retval = CL_SUCCESS;
4347         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4348         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4349         CV_Assert(entry.clBuffer_ != NULL);
4350         if(retval == CL_SUCCESS)
4351         {
4352             CV_IMPL_ADD(CV_IMPL_OCL);
4353         }
4354         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4355                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4356         allocatedEntries_.push_back(entry);
4357     }
4358
4359     void _releaseBufferEntry(const BufferEntry& entry)
4360     {
4361         CV_Assert(entry.capacity_ != 0);
4362         CV_Assert(entry.clBuffer_ != NULL);
4363         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4364                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4365         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4366     }
4367 };
4368
4369 #ifdef HAVE_OPENCL_SVM
4370 struct CLSVMBufferEntry
4371 {
4372     void* clBuffer_;
4373     size_t capacity_;
4374     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4375 };
4376 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4377 {
4378 public:
4379     typedef struct CLSVMBufferEntry BufferEntry;
4380 public:
4381     OpenCLSVMBufferPoolImpl()
4382     {
4383     }
4384
4385     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4386     {
4387         CV_DbgAssert(entry.clBuffer_ == NULL);
4388         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4389
4390         Context& ctx = Context::getDefault();
4391         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4392         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4393         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4394                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4395
4396         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4397         CV_DbgAssert(svmFns->isValid());
4398
4399         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4400         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4401         CV_Assert(buf);
4402
4403         entry.clBuffer_ = buf;
4404         {
4405             CV_IMPL_ADD(CV_IMPL_OCL);
4406         }
4407         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4408                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4409         allocatedEntries_.push_back(entry);
4410     }
4411
4412     void _releaseBufferEntry(const BufferEntry& entry)
4413     {
4414         CV_Assert(entry.capacity_ != 0);
4415         CV_Assert(entry.clBuffer_ != NULL);
4416         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4417                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4418         Context& ctx = Context::getDefault();
4419         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4420         CV_DbgAssert(svmFns->isValid());
4421         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4422         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4423     }
4424 };
4425 #endif
4426
4427
4428
4429 template <bool readAccess, bool writeAccess>
4430 class AlignedDataPtr
4431 {
4432 protected:
4433     const size_t size_;
4434     uchar* const originPtr_;
4435     const size_t alignment_;
4436     uchar* ptr_;
4437     uchar* allocatedPtr_;
4438
4439 public:
4440     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4441         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4442     {
4443         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4444         CV_DbgAssert(!readAccess || ptr);
4445         if (((size_t)ptr_ & (alignment - 1)) != 0)
4446         {
4447             allocatedPtr_ = new uchar[size_ + alignment - 1];
4448             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4449             if (readAccess)
4450             {
4451                 memcpy(ptr_, originPtr_, size_);
4452             }
4453         }
4454     }
4455
4456     uchar* getAlignedPtr() const
4457     {
4458         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4459         return ptr_;
4460     }
4461
4462     ~AlignedDataPtr()
4463     {
4464         if (allocatedPtr_)
4465         {
4466             if (writeAccess)
4467             {
4468                 memcpy(originPtr_, ptr_, size_);
4469             }
4470             delete[] allocatedPtr_;
4471             allocatedPtr_ = NULL;
4472         }
4473         ptr_ = NULL;
4474     }
4475 private:
4476     AlignedDataPtr(const AlignedDataPtr&); // disabled
4477     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4478 };
4479
4480 template <bool readAccess, bool writeAccess>
4481 class AlignedDataPtr2D
4482 {
4483 protected:
4484     const size_t size_;
4485     uchar* const originPtr_;
4486     const size_t alignment_;
4487     uchar* ptr_;
4488     uchar* allocatedPtr_;
4489     size_t rows_;
4490     size_t cols_;
4491     size_t step_;
4492
4493 public:
4494     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4495         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4496     {
4497         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4498         CV_DbgAssert(!readAccess || ptr != NULL);
4499         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
4500         {
4501             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
4502             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4503             if (readAccess)
4504             {
4505                 for (size_t i = 0; i < rows_; i++)
4506                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
4507             }
4508         }
4509     }
4510
4511     uchar* getAlignedPtr() const
4512     {
4513         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4514         return ptr_;
4515     }
4516
4517     ~AlignedDataPtr2D()
4518     {
4519         if (allocatedPtr_)
4520         {
4521             if (writeAccess)
4522             {
4523                 for (size_t i = 0; i < rows_; i++)
4524                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
4525             }
4526             delete[] allocatedPtr_;
4527             allocatedPtr_ = NULL;
4528         }
4529         ptr_ = NULL;
4530     }
4531 private:
4532     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
4533     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
4534 };
4535
4536 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4537 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4538 #endif
4539
4540 class OpenCLAllocator CV_FINAL : public MatAllocator
4541 {
4542     mutable OpenCLBufferPoolImpl bufferPool;
4543     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4544 #ifdef  HAVE_OPENCL_SVM
4545     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4546 #endif
4547
4548 public:
4549     enum AllocatorFlags
4550     {
4551         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4552         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
4553 #ifdef HAVE_OPENCL_SVM
4554         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
4555 #endif
4556         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
4557     };
4558
4559     OpenCLAllocator()
4560         : bufferPool(0),
4561           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4562     {
4563         size_t defaultPoolSize, poolSize;
4564         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4565         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4566         bufferPool.setMaxReservedSize(poolSize);
4567         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4568         bufferPoolHostPtr.setMaxReservedSize(poolSize);
4569 #ifdef HAVE_OPENCL_SVM
4570         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4571         bufferPoolSVM.setMaxReservedSize(poolSize);
4572 #endif
4573
4574         matStdAllocator = Mat::getDefaultAllocator();
4575     }
4576     ~OpenCLAllocator()
4577     {
4578         flushCleanupQueue();
4579     }
4580
4581     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4582             int flags, UMatUsageFlags usageFlags) const
4583     {
4584         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4585         return u;
4586     }
4587
4588     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
4589     {
4590         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
4591         return value;
4592     }
4593     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
4594     {
4595         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
4596         return value;
4597     }
4598
4599     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4600     {
4601         const Device& dev = ctx.device(0);
4602         createFlags = 0;
4603         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4604             createFlags |= CL_MEM_ALLOC_HOST_PTR;
4605
4606         if (!isOpenCLCopyingForced() &&
4607             (isOpenCLMapForced() ||
4608                 (dev.hostUnifiedMemory()
4609 #ifndef __APPLE__
4610                 || dev.isIntel()
4611 #endif
4612                 )
4613             )
4614         )
4615             flags0 = 0;
4616         else
4617             flags0 = UMatData::COPY_ON_MAP;
4618     }
4619
4620     UMatData* allocate(int dims, const int* sizes, int type,
4621                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4622     {
4623         if(!useOpenCL())
4624             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4625         CV_Assert(data == 0);
4626         size_t total = CV_ELEM_SIZE(type);
4627         for( int i = dims-1; i >= 0; i-- )
4628         {
4629             if( step )
4630                 step[i] = total;
4631             total *= sizes[i];
4632         }
4633
4634         Context& ctx = Context::getDefault();
4635         flushCleanupQueue();
4636
4637         int createFlags = 0, flags0 = 0;
4638         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4639
4640         void* handle = NULL;
4641         int allocatorFlags = 0;
4642
4643 #ifdef HAVE_OPENCL_SVM
4644         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4645         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4646         {
4647             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4648             handle = bufferPoolSVM.allocate(total);
4649
4650             // this property is constant, so single buffer pool can be used here
4651             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4652             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4653         }
4654         else
4655 #endif
4656         if (createFlags == 0)
4657         {
4658             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4659             handle = bufferPool.allocate(total);
4660         }
4661         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4662         {
4663             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4664             handle = bufferPoolHostPtr.allocate(total);
4665         }
4666         else
4667         {
4668             CV_Assert(handle != NULL); // Unsupported, throw
4669         }
4670
4671         if (!handle)
4672             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4673
4674         UMatData* u = new UMatData(this);
4675         u->data = 0;
4676         u->size = total;
4677         u->handle = handle;
4678         u->flags = flags0;
4679         u->allocatorFlags_ = allocatorFlags;
4680         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4681         u->markHostCopyObsolete(true);
4682         opencl_allocator_stats.onAllocate(u->size);
4683         return u;
4684     }
4685
4686     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4687     {
4688         if(!u)
4689             return false;
4690
4691         flushCleanupQueue();
4692
4693         UMatDataAutoLock lock(u);
4694
4695         if(u->handle == 0)
4696         {
4697             CV_Assert(u->origdata != 0);
4698             Context& ctx = Context::getDefault();
4699             int createFlags = 0, flags0 = 0;
4700             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4701
4702             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
4703
4704             cl_context ctx_handle = (cl_context)ctx.ptr();
4705             int allocatorFlags = 0;
4706             int tempUMatFlags = 0;
4707             void* handle = NULL;
4708             cl_int retval = CL_SUCCESS;
4709
4710 #ifdef HAVE_OPENCL_SVM
4711             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4712             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4713             if (useSVM && svmCaps.isSupportFineGrainSystem())
4714             {
4715                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4716                 tempUMatFlags = UMatData::TEMP_UMAT;
4717                 handle = u->origdata;
4718                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4719             }
4720             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4721             {
4722                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4723                 {
4724                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4725
4726                     cl_svm_mem_flags memFlags = createFlags |
4727                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4728
4729                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4730                     CV_DbgAssert(svmFns->isValid());
4731
4732                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4733                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4734                     CV_Assert(handle);
4735
4736                     cl_command_queue q = NULL;
4737                     if (!isFineGrainBuffer)
4738                     {
4739                         q = (cl_command_queue)Queue::getDefault().ptr();
4740                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4741                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4742                                 handle, u->size,
4743                                 0, NULL, NULL);
4744                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4745
4746                     }
4747                     memcpy(handle, u->origdata, u->size);
4748                     if (!isFineGrainBuffer)
4749                     {
4750                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4751                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4752                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4753                     }
4754
4755                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4756                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4757                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4758                 }
4759             }
4760             else
4761 #endif
4762             {
4763                 if( copyOnMap )
4764                     accessFlags &= ~ACCESS_FAST;
4765
4766                 tempUMatFlags = UMatData::TEMP_UMAT;
4767                 if (
4768                 #ifdef __APPLE__
4769                     !copyOnMap &&
4770                 #endif
4771                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
4772                     // There are OpenCL runtime issues for less aligned data
4773                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
4774                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
4775                     // Avoid sharing of host memory between OpenCL buffers
4776                     && !(u->originalUMatData && u->originalUMatData->handle)
4777                 )
4778                 {
4779                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4780                                             u->size, u->origdata, &retval);
4781                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
4782                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4783                 }
4784                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4785                 {
4786                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4787                                                u->size, u->origdata, &retval);
4788                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
4789                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4790                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4791                 }
4792             }
4793             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
4794             if(!handle || retval != CL_SUCCESS)
4795                 return false;
4796             u->handle = handle;
4797             u->prevAllocator = u->currAllocator;
4798             u->currAllocator = this;
4799             u->flags |= tempUMatFlags | flags0;
4800             u->allocatorFlags_ = allocatorFlags;
4801         }
4802         if(accessFlags & ACCESS_WRITE)
4803             u->markHostCopyObsolete(true);
4804         opencl_allocator_stats.onAllocate(u->size);
4805         return true;
4806     }
4807
4808     /*void sync(UMatData* u) const
4809     {
4810         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4811         UMatDataAutoLock lock(u);
4812
4813         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4814         {
4815             if( u->tempCopiedUMat() )
4816             {
4817                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4818                                     u->size, u->origdata, 0, 0, 0);
4819             }
4820             else
4821             {
4822                 cl_int retval = 0;
4823                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4824                                                 (CL_MAP_READ | CL_MAP_WRITE),
4825                                                 0, u->size, 0, 0, 0, &retval);
4826                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4827                 clFinish(q);
4828             }
4829             u->markHostCopyObsolete(false);
4830         }
4831         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4832         {
4833             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4834                                  u->size, u->data, 0, 0, 0);
4835         }
4836     }*/
4837
4838     void deallocate(UMatData* u) const CV_OVERRIDE
4839     {
4840         if(!u)
4841             return;
4842
4843         CV_Assert(u->urefcount == 0);
4844         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
4845
4846         CV_Assert(u->handle != 0);
4847         CV_Assert(u->mapcount == 0);
4848
4849         if (u->flags & UMatData::ASYNC_CLEANUP)
4850             addToCleanupQueue(u);
4851         else
4852             deallocate_(u);
4853     }
4854
4855     void deallocate_(UMatData* u) const
4856     {
4857         CV_Assert(u);
4858         CV_Assert(u->handle);
4859         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
4860         {
4861             opencl_allocator_stats.onFree(u->size);
4862         }
4863
4864 #ifdef _WIN32
4865         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
4866             return;             // avoid any OpenCL calls
4867 #endif
4868         if(u->tempUMat())
4869         {
4870             CV_Assert(u->origdata);
4871 //            UMatDataAutoLock lock(u);
4872
4873             if (u->hostCopyObsolete())
4874             {
4875 #ifdef HAVE_OPENCL_SVM
4876                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4877                 {
4878                     Context& ctx = Context::getDefault();
4879                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4880                     CV_DbgAssert(svmFns->isValid());
4881
4882                     if( u->tempCopiedUMat() )
4883                     {
4884                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4885                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4886                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4887                         cl_command_queue q = NULL;
4888                         if (!isFineGrainBuffer)
4889                         {
4890                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4891                             q = (cl_command_queue)Queue::getDefault().ptr();
4892                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4893                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4894                                     u->handle, u->size,
4895                                     0, NULL, NULL);
4896                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4897                         }
4898                         clFinish(q);
4899                         memcpy(u->origdata, u->handle, u->size);
4900                         if (!isFineGrainBuffer)
4901                         {
4902                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4903                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4904                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4905                         }
4906                     }
4907                     else
4908                     {
4909                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4910                         // nothing
4911                     }
4912                 }
4913                 else
4914 #endif
4915                 {
4916                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4917                     if( u->tempCopiedUMat() )
4918                     {
4919                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4920                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4921                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
4922                     }
4923                     else
4924                     {
4925                         cl_int retval = 0;
4926                         if (u->tempUMat())
4927                         {
4928                             CV_Assert(u->mapcount == 0);
4929                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
4930                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4931                                 (CL_MAP_READ | CL_MAP_WRITE),
4932                                 0, u->size, 0, 0, 0, &retval);
4933                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
4934                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
4935                             if (u->originalUMatData)
4936                             {
4937                                 CV_Assert(u->originalUMatData->data == data);
4938                             }
4939                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4940                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
4941                             CV_OCL_DBG_CHECK(clFinish(q));
4942                         }
4943                     }
4944                 }
4945                 u->markHostCopyObsolete(false);
4946             }
4947             else
4948             {
4949                 // nothing
4950             }
4951 #ifdef HAVE_OPENCL_SVM
4952             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4953             {
4954                 if( u->tempCopiedUMat() )
4955                 {
4956                     Context& ctx = Context::getDefault();
4957                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4958                     CV_DbgAssert(svmFns->isValid());
4959
4960                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4961                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4962                 }
4963             }
4964             else
4965 #endif
4966             {
4967                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
4968                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
4969             }
4970             u->handle = 0;
4971             u->markDeviceCopyObsolete(true);
4972             u->currAllocator = u->prevAllocator;
4973             u->prevAllocator = NULL;
4974             if(u->data && u->copyOnMap() && u->data != u->origdata)
4975                 fastFree(u->data);
4976             u->data = u->origdata;
4977             u->currAllocator->deallocate(u);
4978             u = NULL;
4979         }
4980         else
4981         {
4982             CV_Assert(u->origdata == NULL);
4983             if(u->data && u->copyOnMap() && u->data != u->origdata)
4984             {
4985                 fastFree(u->data);
4986                 u->data = 0;
4987                 u->markHostCopyObsolete(true);
4988             }
4989             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4990             {
4991                 bufferPool.release((cl_mem)u->handle);
4992             }
4993             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4994             {
4995                 bufferPoolHostPtr.release((cl_mem)u->handle);
4996             }
4997 #ifdef HAVE_OPENCL_SVM
4998             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4999             {
5000                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5001                 {
5002                     //nothing
5003                 }
5004                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5005                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5006                 {
5007                     Context& ctx = Context::getDefault();
5008                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5009                     CV_DbgAssert(svmFns->isValid());
5010                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5011
5012                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5013                     {
5014                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5015                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5016                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5017                     }
5018                 }
5019                 bufferPoolSVM.release((void*)u->handle);
5020             }
5021 #endif
5022             else
5023             {
5024                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5025             }
5026             u->handle = 0;
5027             u->markDeviceCopyObsolete(true);
5028             delete u;
5029             u = NULL;
5030         }
5031         CV_Assert(u == NULL);
5032     }
5033
5034     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5035     void map(UMatData* u, int accessFlags) const CV_OVERRIDE
5036     {
5037         CV_Assert(u && u->handle);
5038
5039         if(accessFlags & ACCESS_WRITE)
5040             u->markDeviceCopyObsolete(true);
5041
5042         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5043
5044         {
5045             if( !u->copyOnMap() )
5046             {
5047                 // TODO
5048                 // because there can be other map requests for the same UMat with different access flags,
5049                 // we use the universal (read-write) access mode.
5050 #ifdef HAVE_OPENCL_SVM
5051                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5052                 {
5053                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5054                     {
5055                         Context& ctx = Context::getDefault();
5056                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5057                         CV_DbgAssert(svmFns->isValid());
5058
5059                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5060                         {
5061                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5062                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5063                                     u->handle, u->size,
5064                                     0, NULL, NULL);
5065                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5066                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5067                         }
5068                     }
5069                     clFinish(q);
5070                     u->data = (uchar*)u->handle;
5071                     u->markHostCopyObsolete(false);
5072                     u->markDeviceMemMapped(true);
5073                     return;
5074                 }
5075 #endif
5076
5077                 cl_int retval = CL_SUCCESS;
5078                 if (!u->deviceMemMapped())
5079                 {
5080                     CV_Assert(u->refcount == 1);
5081                     CV_Assert(u->mapcount++ == 0);
5082                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5083                                                          (CL_MAP_READ | CL_MAP_WRITE),
5084                                                          0, u->size, 0, 0, 0, &retval);
5085                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5086                 }
5087                 if (u->data && retval == CL_SUCCESS)
5088                 {
5089                     u->markHostCopyObsolete(false);
5090                     u->markDeviceMemMapped(true);
5091                     return;
5092                 }
5093
5094                 // TODO Is it really a good idea and was it tested well?
5095                 // if map failed, switch to copy-on-map mode for the particular buffer
5096                 u->flags |= UMatData::COPY_ON_MAP;
5097             }
5098
5099             if(!u->data)
5100             {
5101                 u->data = (uchar*)fastMalloc(u->size);
5102                 u->markHostCopyObsolete(true);
5103             }
5104         }
5105
5106         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
5107         {
5108             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5109 #ifdef HAVE_OPENCL_SVM
5110             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5111 #endif
5112             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5113                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5114             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5115                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5116             u->markHostCopyObsolete(false);
5117         }
5118     }
5119
5120     void unmap(UMatData* u) const CV_OVERRIDE
5121     {
5122         if(!u)
5123             return;
5124
5125
5126         CV_Assert(u->handle != 0);
5127
5128         UMatDataAutoLock autolock(u);
5129
5130         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5131         cl_int retval = 0;
5132         if( !u->copyOnMap() && u->deviceMemMapped() )
5133         {
5134             CV_Assert(u->data != NULL);
5135 #ifdef HAVE_OPENCL_SVM
5136             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5137             {
5138                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5139                 {
5140                     Context& ctx = Context::getDefault();
5141                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5142                     CV_DbgAssert(svmFns->isValid());
5143
5144                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5145                     {
5146                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5147                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5148                                 0, NULL, NULL);
5149                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5150                         clFinish(q);
5151                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5152                     }
5153                 }
5154                 if (u->refcount == 0)
5155                     u->data = 0;
5156                 u->markDeviceCopyObsolete(false);
5157                 u->markHostCopyObsolete(true);
5158                 return;
5159             }
5160 #endif
5161             if (u->refcount == 0)
5162             {
5163                 CV_Assert(u->mapcount-- == 1);
5164                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5165                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5166                 if (Device::getDefault().isAMD())
5167                 {
5168                     // required for multithreaded applications (see stitching test)
5169                     CV_OCL_DBG_CHECK(clFinish(q));
5170                 }
5171                 u->markDeviceMemMapped(false);
5172                 u->data = 0;
5173                 u->markDeviceCopyObsolete(false);
5174                 u->markHostCopyObsolete(true);
5175             }
5176         }
5177         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5178         {
5179             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5180 #ifdef HAVE_OPENCL_SVM
5181             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5182 #endif
5183             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5184                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5185             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5186                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5187             u->markDeviceCopyObsolete(false);
5188             u->markHostCopyObsolete(true);
5189         }
5190     }
5191
5192     bool checkContinuous(int dims, const size_t sz[],
5193                          const size_t srcofs[], const size_t srcstep[],
5194                          const size_t dstofs[], const size_t dststep[],
5195                          size_t& total, size_t new_sz[],
5196                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5197                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5198     {
5199         bool iscontinuous = true;
5200         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5201         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5202         total = sz[dims-1];
5203         for( int i = dims-2; i >= 0; i-- )
5204         {
5205             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5206                 iscontinuous = false;
5207             total *= sz[i];
5208             if( srcofs )
5209                 srcrawofs += srcofs[i]*srcstep[i];
5210             if( dstofs )
5211                 dstrawofs += dstofs[i]*dststep[i];
5212         }
5213
5214         if( !iscontinuous )
5215         {
5216             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5217             if( dims == 2 )
5218             {
5219                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5220                 // we assume that new_... arrays are initialized by caller
5221                 // with 0's, so there is no else branch
5222                 if( srcofs )
5223                 {
5224                     new_srcofs[0] = srcofs[1];
5225                     new_srcofs[1] = srcofs[0];
5226                     new_srcofs[2] = 0;
5227                 }
5228
5229                 if( dstofs )
5230                 {
5231                     new_dstofs[0] = dstofs[1];
5232                     new_dstofs[1] = dstofs[0];
5233                     new_dstofs[2] = 0;
5234                 }
5235
5236                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5237                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5238             }
5239             else
5240             {
5241                 // we could check for dims == 3 here,
5242                 // but from user perspective this one is more informative
5243                 CV_Assert(dims <= 3);
5244                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5245                 if( srcofs )
5246                 {
5247                     new_srcofs[0] = srcofs[2];
5248                     new_srcofs[1] = srcofs[1];
5249                     new_srcofs[2] = srcofs[0];
5250                 }
5251
5252                 if( dstofs )
5253                 {
5254                     new_dstofs[0] = dstofs[2];
5255                     new_dstofs[1] = dstofs[1];
5256                     new_dstofs[2] = dstofs[0];
5257                 }
5258
5259                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5260                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5261             }
5262         }
5263         return iscontinuous;
5264     }
5265
5266     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5267                   const size_t srcofs[], const size_t srcstep[],
5268                   const size_t dststep[]) const CV_OVERRIDE
5269     {
5270         if(!u)
5271             return;
5272         UMatDataAutoLock autolock(u);
5273
5274         if( u->data && !u->hostCopyObsolete() )
5275         {
5276             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5277             return;
5278         }
5279         CV_Assert( u->handle != 0 );
5280
5281         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5282
5283         size_t total = 0, new_sz[] = {0, 0, 0};
5284         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5285         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5286
5287         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5288                                             total, new_sz,
5289                                             srcrawofs, new_srcofs, new_srcstep,
5290                                             dstrawofs, new_dstofs, new_dststep);
5291
5292 #ifdef HAVE_OPENCL_SVM
5293         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5294         {
5295             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5296             Context& ctx = Context::getDefault();
5297             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5298             CV_DbgAssert(svmFns->isValid());
5299
5300             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5301             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5302             {
5303                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5304                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5305                         u->handle, u->size,
5306                         0, NULL, NULL);
5307                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5308             }
5309             clFinish(q);
5310             if( iscontinuous )
5311             {
5312                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5313             }
5314             else
5315             {
5316                 // This code is from MatAllocator::download()
5317                 int isz[CV_MAX_DIM];
5318                 uchar* srcptr = (uchar*)u->handle;
5319                 for( int i = 0; i < dims; i++ )
5320                 {
5321                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5322                     if( sz[i] == 0 )
5323                     return;
5324                     if( srcofs )
5325                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5326                     isz[i] = (int)sz[i];
5327                 }
5328
5329                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5330                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5331
5332                 const Mat* arrays[] = { &src, &dst };
5333                 uchar* ptrs[2];
5334                 NAryMatIterator it(arrays, ptrs, 2);
5335                 size_t j, planesz = it.size;
5336
5337                 for( j = 0; j < it.nplanes; j++, ++it )
5338                     memcpy(ptrs[1], ptrs[0], planesz);
5339             }
5340             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5341             {
5342                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5343                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5344                         0, NULL, NULL);
5345                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5346                 clFinish(q);
5347             }
5348         }
5349         else
5350 #endif
5351         {
5352             if( iscontinuous )
5353             {
5354                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5355                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5356                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5357             }
5358             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5359             {
5360                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5361                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5362                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5363                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5364                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5365                 uchar* ptr = alignedPtr.getAlignedPtr();
5366
5367                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5368                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5369                 total = std::min(total, u->size - new_srcrawofs);
5370                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5371                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5372                 for( size_t i = 0; i < new_sz[1]; i++ )
5373                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5374             }
5375             else
5376             {
5377                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5378                 uchar* ptr = alignedPtr.getAlignedPtr();
5379
5380                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5381                     new_srcofs, new_dstofs, new_sz,
5382                     new_srcstep[0], 0,
5383                     new_dststep[0], 0,
5384                     ptr, 0, 0, 0));
5385             }
5386         }
5387     }
5388
5389     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5390                 const size_t dstofs[], const size_t dststep[],
5391                 const size_t srcstep[]) const CV_OVERRIDE
5392     {
5393         if(!u)
5394             return;
5395
5396         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5397         CV_Assert(u->refcount == 0 || u->tempUMat());
5398
5399         size_t total = 0, new_sz[] = {0, 0, 0};
5400         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5401         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5402
5403         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5404                                             total, new_sz,
5405                                             srcrawofs, new_srcofs, new_srcstep,
5406                                             dstrawofs, new_dstofs, new_dststep);
5407
5408         UMatDataAutoLock autolock(u);
5409
5410         // if there is cached CPU copy of the GPU matrix,
5411         // we could use it as a destination.
5412         // we can do it in 2 cases:
5413         //    1. we overwrite the whole content
5414         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5415         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5416         {
5417             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5418             u->markHostCopyObsolete(false);
5419             u->markDeviceCopyObsolete(true);
5420             return;
5421         }
5422
5423         CV_Assert( u->handle != 0 );
5424         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5425
5426 #ifdef HAVE_OPENCL_SVM
5427         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5428         {
5429             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5430             Context& ctx = Context::getDefault();
5431             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5432             CV_DbgAssert(svmFns->isValid());
5433
5434             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5435             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5436             {
5437                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5438                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5439                         u->handle, u->size,
5440                         0, NULL, NULL);
5441                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5442             }
5443             clFinish(q);
5444             if( iscontinuous )
5445             {
5446                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5447             }
5448             else
5449             {
5450                 // This code is from MatAllocator::upload()
5451                 int isz[CV_MAX_DIM];
5452                 uchar* dstptr = (uchar*)u->handle;
5453                 for( int i = 0; i < dims; i++ )
5454                 {
5455                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5456                     if( sz[i] == 0 )
5457                     return;
5458                     if( dstofs )
5459                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5460                     isz[i] = (int)sz[i];
5461                 }
5462
5463                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5464                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5465
5466                 const Mat* arrays[] = { &src, &dst };
5467                 uchar* ptrs[2];
5468                 NAryMatIterator it(arrays, ptrs, 2);
5469                 size_t j, planesz = it.size;
5470
5471                 for( j = 0; j < it.nplanes; j++, ++it )
5472                     memcpy(ptrs[1], ptrs[0], planesz);
5473             }
5474             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5475             {
5476                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5477                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5478                         0, NULL, NULL);
5479                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5480                 clFinish(q);
5481             }
5482         }
5483         else
5484 #endif
5485         {
5486             if( iscontinuous )
5487             {
5488                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5489                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5490                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
5491                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
5492                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5493             }
5494             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5495             {
5496                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5497                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5498                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
5499                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
5500                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5501                 uchar* ptr = alignedPtr.getAlignedPtr();
5502
5503                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5504                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
5505                 total = std::min(total, u->size - new_dstrawofs);
5506                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
5507                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
5508                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
5509                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5510                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5511                 for( size_t i = 0; i < new_sz[1]; i++ )
5512                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
5513                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5514                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5515             }
5516             else
5517             {
5518                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5519                 uchar* ptr = alignedPtr.getAlignedPtr();
5520
5521                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5522                     new_dstofs, new_srcofs, new_sz,
5523                     new_dststep[0], 0,
5524                     new_srcstep[0], 0,
5525                     ptr, 0, 0, 0));
5526             }
5527         }
5528         u->markHostCopyObsolete(true);
5529 #ifdef HAVE_OPENCL_SVM
5530         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5531                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5532         {
5533             // nothing
5534         }
5535         else
5536 #endif
5537         {
5538             u->markHostCopyObsolete(true);
5539         }
5540         u->markDeviceCopyObsolete(false);
5541     }
5542
5543     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
5544               const size_t srcofs[], const size_t srcstep[],
5545               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
5546     {
5547         if(!src || !dst)
5548             return;
5549
5550         size_t total = 0, new_sz[] = {0, 0, 0};
5551         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5552         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5553
5554         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
5555                                             total, new_sz,
5556                                             srcrawofs, new_srcofs, new_srcstep,
5557                                             dstrawofs, new_dstofs, new_dststep);
5558
5559         UMatDataAutoLock src_autolock(src, dst);
5560
5561         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5562         {
5563             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5564             return;
5565         }
5566         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5567         {
5568             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5569             dst->markHostCopyObsolete(false);
5570 #ifdef HAVE_OPENCL_SVM
5571             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5572                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5573             {
5574                 // nothing
5575             }
5576             else
5577 #endif
5578             {
5579                 dst->markDeviceCopyObsolete(true);
5580             }
5581             return;
5582         }
5583
5584         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5585         CV_Assert(dst->refcount == 0);
5586         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5587
5588         cl_int retval = CL_SUCCESS;
5589 #ifdef HAVE_OPENCL_SVM
5590         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5591                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5592         {
5593             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5594                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5595             {
5596                 Context& ctx = Context::getDefault();
5597                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5598                 CV_DbgAssert(svmFns->isValid());
5599
5600                 if( iscontinuous )
5601                 {
5602                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5603                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5604                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5605                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5606                             total, 0, NULL, NULL);
5607                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
5608                 }
5609                 else
5610                 {
5611                     clFinish(q);
5612                     // This code is from MatAllocator::download()/upload()
5613                     int isz[CV_MAX_DIM];
5614                     uchar* srcptr = (uchar*)src->handle;
5615                     for( int i = 0; i < dims; i++ )
5616                     {
5617                         CV_Assert( sz[i] <= (size_t)INT_MAX );
5618                         if( sz[i] == 0 )
5619                         return;
5620                         if( srcofs )
5621                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5622                         isz[i] = (int)sz[i];
5623                     }
5624                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5625
5626                     uchar* dstptr = (uchar*)dst->handle;
5627                     for( int i = 0; i < dims; i++ )
5628                     {
5629                         if( dstofs )
5630                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5631                     }
5632                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5633
5634                     const Mat* arrays[] = { &m_src, &m_dst };
5635                     uchar* ptrs[2];
5636                     NAryMatIterator it(arrays, ptrs, 2);
5637                     size_t j, planesz = it.size;
5638
5639                     for( j = 0; j < it.nplanes; j++, ++it )
5640                         memcpy(ptrs[1], ptrs[0], planesz);
5641                 }
5642             }
5643             else
5644             {
5645                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5646                 {
5647                     map(src, ACCESS_READ);
5648                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5649                     unmap(src);
5650                 }
5651                 else
5652                 {
5653                     map(dst, ACCESS_WRITE);
5654                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5655                     unmap(dst);
5656                 }
5657             }
5658         }
5659         else
5660 #endif
5661         {
5662             if( iscontinuous )
5663             {
5664                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5665                                                srcrawofs, dstrawofs, total, 0, 0, 0);
5666                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
5667                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
5668             }
5669             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5670             {
5671                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5672                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5673                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
5674                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5675                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
5676
5677                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5678                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5679                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
5680                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5681                 uchar* srcptr = srcBuf.getAlignedPtr();
5682                 uchar* dstptr = dstBuf.getAlignedPtr();
5683
5684                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5685
5686                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
5687                 src_total = std::min(src_total, src->size - new_srcrawofs);
5688                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
5689                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
5690
5691                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
5692                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
5693                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5694                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5695
5696                 for( size_t i = 0; i < new_sz[1]; i++ )
5697                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
5698                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
5699                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5700                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5701             }
5702             else
5703             {
5704                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5705                                                    new_srcofs, new_dstofs, new_sz,
5706                                                    new_srcstep[0], 0,
5707                                                    new_dststep[0], 0,
5708                                                    0, 0, 0));
5709             }
5710         }
5711         if (retval == CL_SUCCESS)
5712         {
5713             CV_IMPL_ADD(CV_IMPL_OCL)
5714         }
5715
5716 #ifdef HAVE_OPENCL_SVM
5717         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5718             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5719         {
5720             // nothing
5721         }
5722         else
5723 #endif
5724         {
5725             dst->markHostCopyObsolete(true);
5726         }
5727         dst->markDeviceCopyObsolete(false);
5728
5729         if( _sync )
5730         {
5731             CV_OCL_DBG_CHECK(clFinish(q));
5732         }
5733     }
5734
5735     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE {
5736 #ifdef HAVE_OPENCL_SVM
5737         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5738         {
5739             return &bufferPoolSVM;
5740         }
5741 #endif
5742         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5743         {
5744             return &bufferPoolHostPtr;
5745         }
5746         if (id != NULL && strcmp(id, "OCL") != 0)
5747         {
5748             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5749         }
5750         return &bufferPool;
5751     }
5752
5753     MatAllocator* matStdAllocator;
5754
5755     mutable cv::Mutex cleanupQueueMutex;
5756     mutable std::deque<UMatData*> cleanupQueue;
5757
5758     void flushCleanupQueue() const
5759     {
5760         if (!cleanupQueue.empty())
5761         {
5762             std::deque<UMatData*> q;
5763             {
5764                 cv::AutoLock lock(cleanupQueueMutex);
5765                 q.swap(cleanupQueue);
5766             }
5767             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
5768             {
5769                 deallocate_(*i);
5770             }
5771         }
5772     }
5773     void addToCleanupQueue(UMatData* u) const
5774     {
5775         //TODO: Validation check: CV_Assert(!u->tempUMat());
5776         {
5777             cv::AutoLock lock(cleanupQueueMutex);
5778             cleanupQueue.push_back(u);
5779         }
5780     }
5781 };
5782
5783 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
5784 {
5785     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
5786     g_isOpenCVActivated = true;
5787     return g_allocator;
5788 }
5789 MatAllocator* getOpenCLAllocator()
5790 {
5791     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
5792 }
5793
5794 }} // namespace cv::ocl
5795
5796
5797 namespace cv {
5798
5799 // three funcs below are implemented in umatrix.cpp
5800 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
5801               bool autoSteps = false );
5802 void finalizeHdr(UMat& m);
5803
5804 } // namespace cv
5805
5806
5807 namespace cv { namespace ocl {
5808
5809 /*
5810 // Convert OpenCL buffer memory to UMat
5811 */
5812 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
5813 {
5814     int d = 2;
5815     int sizes[] = { rows, cols };
5816
5817     CV_Assert(0 <= d && d <= CV_MAX_DIM);
5818
5819     dst.release();
5820
5821     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
5822     dst.usageFlags = USAGE_DEFAULT;
5823
5824     setSize(dst, d, sizes, 0, true);
5825     dst.offset = 0;
5826
5827     cl_mem             memobj = (cl_mem)cl_mem_buffer;
5828     cl_mem_object_type mem_type = 0;
5829
5830     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5831
5832     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
5833
5834     size_t total = 0;
5835     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
5836
5837     CV_OCL_CHECK(clRetainMemObject(memobj));
5838
5839     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
5840     CV_Assert(total >= rows * step);
5841
5842     // attach clBuffer to UMatData
5843     dst.u = new UMatData(getOpenCLAllocator());
5844     dst.u->data            = 0;
5845     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
5846     dst.u->flags           = 0;
5847     dst.u->handle          = cl_mem_buffer;
5848     dst.u->origdata        = 0;
5849     dst.u->prevAllocator   = 0;
5850     dst.u->size            = total;
5851
5852     finalizeHdr(dst);
5853     dst.addref();
5854
5855     return;
5856 } // convertFromBuffer()
5857
5858
5859 /*
5860 // Convert OpenCL image2d_t memory to UMat
5861 */
5862 void convertFromImage(void* cl_mem_image, UMat& dst)
5863 {
5864     cl_mem             clImage = (cl_mem)cl_mem_image;
5865     cl_mem_object_type mem_type = 0;
5866
5867     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5868
5869     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
5870
5871     cl_image_format fmt = { 0, 0 };
5872     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
5873
5874     int depth = CV_8U;
5875     switch (fmt.image_channel_data_type)
5876     {
5877     case CL_UNORM_INT8:
5878     case CL_UNSIGNED_INT8:
5879         depth = CV_8U;
5880         break;
5881
5882     case CL_SNORM_INT8:
5883     case CL_SIGNED_INT8:
5884         depth = CV_8S;
5885         break;
5886
5887     case CL_UNORM_INT16:
5888     case CL_UNSIGNED_INT16:
5889         depth = CV_16U;
5890         break;
5891
5892     case CL_SNORM_INT16:
5893     case CL_SIGNED_INT16:
5894         depth = CV_16S;
5895         break;
5896
5897     case CL_SIGNED_INT32:
5898         depth = CV_32S;
5899         break;
5900
5901     case CL_FLOAT:
5902         depth = CV_32F;
5903         break;
5904
5905     default:
5906         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
5907     }
5908
5909     int type = CV_8UC1;
5910     switch (fmt.image_channel_order)
5911     {
5912     case CL_R:
5913         type = CV_MAKE_TYPE(depth, 1);
5914         break;
5915
5916     case CL_RGBA:
5917     case CL_BGRA:
5918     case CL_ARGB:
5919         type = CV_MAKE_TYPE(depth, 4);
5920         break;
5921
5922     default:
5923         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
5924         break;
5925     }
5926
5927     size_t step = 0;
5928     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
5929
5930     size_t w = 0;
5931     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
5932
5933     size_t h = 0;
5934     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
5935
5936     dst.create((int)h, (int)w, type);
5937
5938     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
5939
5940     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5941
5942     size_t offset = 0;
5943     size_t src_origin[3] = { 0, 0, 0 };
5944     size_t region[3] = { w, h, 1 };
5945     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
5946
5947     CV_OCL_CHECK(clFinish(q));
5948
5949     return;
5950 } // convertFromImage()
5951
5952
5953 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5954
5955 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5956 {
5957     cl_uint numDevices = 0;
5958     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
5959     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
5960     {
5961         CV_OCL_DBG_CHECK_RESULT(status,
5962             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
5963     }
5964
5965     if (numDevices == 0)
5966     {
5967         devices.clear();
5968         return;
5969     }
5970
5971     devices.resize((size_t)numDevices);
5972     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
5973 }
5974
5975 struct PlatformInfo::Impl
5976 {
5977     Impl(void* id)
5978     {
5979         refcount = 1;
5980         handle = *(cl_platform_id*)id;
5981         getDevices(devices, handle);
5982
5983         version_ = getStrProp(CL_PLATFORM_VERSION);
5984         parseOpenCLVersion(version_, versionMajor_, versionMinor_);
5985     }
5986
5987     String getStrProp(cl_platform_info prop) const
5988     {
5989         char buf[1024];
5990         size_t sz=0;
5991         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5992             sz < sizeof(buf) ? String(buf) : String();
5993     }
5994
5995     IMPLEMENT_REFCOUNTABLE();
5996     std::vector<cl_device_id> devices;
5997     cl_platform_id handle;
5998
5999     String version_;
6000     int versionMajor_;
6001     int versionMinor_;
6002 };
6003
6004 PlatformInfo::PlatformInfo()
6005 {
6006     p = 0;
6007 }
6008
6009 PlatformInfo::PlatformInfo(void* platform_id)
6010 {
6011     p = new Impl(platform_id);
6012 }
6013
6014 PlatformInfo::~PlatformInfo()
6015 {
6016     if(p)
6017         p->release();
6018 }
6019
6020 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6021 {
6022     if (i.p)
6023         i.p->addref();
6024     p = i.p;
6025 }
6026
6027 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6028 {
6029     if (i.p != p)
6030     {
6031         if (i.p)
6032             i.p->addref();
6033         if (p)
6034             p->release();
6035         p = i.p;
6036     }
6037     return *this;
6038 }
6039
6040 int PlatformInfo::deviceNumber() const
6041 {
6042     return p ? (int)p->devices.size() : 0;
6043 }
6044
6045 void PlatformInfo::getDevice(Device& device, int d) const
6046 {
6047     CV_Assert(p && d < (int)p->devices.size() );
6048     if(p)
6049         device.set(p->devices[d]);
6050 }
6051
6052 String PlatformInfo::name() const
6053 {
6054     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6055 }
6056
6057 String PlatformInfo::vendor() const
6058 {
6059     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6060 }
6061
6062 String PlatformInfo::version() const
6063 {
6064     return p ? p->version_ : String();
6065 }
6066
6067 int PlatformInfo::versionMajor() const
6068 {
6069     CV_Assert(p);
6070     return p->versionMajor_;
6071 }
6072
6073 int PlatformInfo::versionMinor() const
6074 {
6075     CV_Assert(p);
6076     return p->versionMinor_;
6077 }
6078
6079 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6080 {
6081     cl_uint numPlatforms = 0;
6082     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6083
6084     if (numPlatforms == 0)
6085     {
6086         platforms.clear();
6087         return;
6088     }
6089
6090     platforms.resize((size_t)numPlatforms);
6091     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6092 }
6093
6094 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6095 {
6096     std::vector<cl_platform_id> platforms;
6097     getPlatforms(platforms);
6098
6099     for (size_t i = 0; i < platforms.size(); i++)
6100         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6101 }
6102
6103 const char* typeToStr(int type)
6104 {
6105     static const char* tab[]=
6106     {
6107         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6108         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6109         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6110         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6111         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6112         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6113         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6114         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6115     };
6116     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6117     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6118 }
6119
6120 const char* memopTypeToStr(int type)
6121 {
6122     static const char* tab[] =
6123     {
6124         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6125         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6126         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6127         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6128         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6129         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6130         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6131         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6132     };
6133     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6134     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6135 }
6136
6137 const char* vecopTypeToStr(int type)
6138 {
6139     static const char* tab[] =
6140     {
6141         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6142         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6143         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6144         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6145         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6146         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6147         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6148         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6149     };
6150     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6151     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6152 }
6153
6154 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6155 {
6156     if( sdepth == ddepth )
6157         return "noconvert";
6158     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6159     if( ddepth >= CV_32F ||
6160         (ddepth == CV_32S && sdepth < CV_32S) ||
6161         (ddepth == CV_16S && sdepth <= CV_8S) ||
6162         (ddepth == CV_16U && sdepth == CV_8U))
6163     {
6164         sprintf(buf, "convert_%s", typestr);
6165     }
6166     else if( sdepth >= CV_32F )
6167         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6168     else
6169         sprintf(buf, "convert_%s_sat", typestr);
6170
6171     return buf;
6172 }
6173
6174 const char* getOpenCLErrorString(int errorCode)
6175 {
6176 #define CV_OCL_CODE(id) case id: return #id
6177 #define CV_OCL_CODE_(id, name) case id: return #name
6178     switch (errorCode)
6179     {
6180     CV_OCL_CODE(CL_SUCCESS);
6181     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6182     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6183     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6184     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6185     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6186     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6187     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6188     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6189     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6190     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6191     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6192     CV_OCL_CODE(CL_MAP_FAILURE);
6193     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6194     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6195     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6196     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6197     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6198     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6199     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6200     CV_OCL_CODE(CL_INVALID_VALUE);
6201     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6202     CV_OCL_CODE(CL_INVALID_PLATFORM);
6203     CV_OCL_CODE(CL_INVALID_DEVICE);
6204     CV_OCL_CODE(CL_INVALID_CONTEXT);
6205     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6206     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6207     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6208     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6209     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6210     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6211     CV_OCL_CODE(CL_INVALID_SAMPLER);
6212     CV_OCL_CODE(CL_INVALID_BINARY);
6213     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6214     CV_OCL_CODE(CL_INVALID_PROGRAM);
6215     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6216     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6217     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6218     CV_OCL_CODE(CL_INVALID_KERNEL);
6219     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6220     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6221     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6222     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6223     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6224     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6225     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6226     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6227     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6228     CV_OCL_CODE(CL_INVALID_EVENT);
6229     CV_OCL_CODE(CL_INVALID_OPERATION);
6230     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6231     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6232     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6233     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6234     // OpenCL 1.1
6235     CV_OCL_CODE(CL_INVALID_PROPERTY);
6236     // OpenCL 1.2
6237     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6238     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6239     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6240     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6241     // OpenCL 2.0
6242     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6243     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6244     // Extensions
6245     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6246     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6247     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6248     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6249     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6250     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6251     default: return "Unknown OpenCL error";
6252     }
6253 #undef CV_OCL_CODE
6254 #undef CV_OCL_CODE_
6255 }
6256
6257 template <typename T>
6258 static std::string kerToStr(const Mat & k)
6259 {
6260     int width = k.cols - 1, depth = k.depth();
6261     const T * const data = k.ptr<T>();
6262
6263     std::ostringstream stream;
6264     stream.precision(10);
6265
6266     if (depth <= CV_8S)
6267     {
6268         for (int i = 0; i < width; ++i)
6269             stream << "DIG(" << (int)data[i] << ")";
6270         stream << "DIG(" << (int)data[width] << ")";
6271     }
6272     else if (depth == CV_32F)
6273     {
6274         stream.setf(std::ios_base::showpoint);
6275         for (int i = 0; i < width; ++i)
6276             stream << "DIG(" << data[i] << "f)";
6277         stream << "DIG(" << data[width] << "f)";
6278     }
6279     else
6280     {
6281         for (int i = 0; i < width; ++i)
6282             stream << "DIG(" << data[i] << ")";
6283         stream << "DIG(" << data[width] << ")";
6284     }
6285
6286     return stream.str();
6287 }
6288
6289 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6290 {
6291     Mat kernel = _kernel.getMat().reshape(1, 1);
6292
6293     int depth = kernel.depth();
6294     if (ddepth < 0)
6295         ddepth = depth;
6296
6297     if (ddepth != depth)
6298         kernel.convertTo(kernel, ddepth);
6299
6300     typedef std::string (* func_t)(const Mat &);
6301     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6302                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6303     const func_t func = funcs[ddepth];
6304     CV_Assert(func != 0);
6305
6306     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6307 }
6308
6309 #define PROCESS_SRC(src) \
6310     do \
6311     { \
6312         if (!src.empty()) \
6313         { \
6314             CV_Assert(src.isMat() || src.isUMat()); \
6315             Size csize = src.size(); \
6316             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6317                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6318             if (cwidth < ckercn || ckercn <= 0) \
6319                 return 1; \
6320             cols.push_back(cwidth); \
6321             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6322                 return 1; \
6323             offsets.push_back(src.offset()); \
6324             steps.push_back(src.step()); \
6325             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6326             kercns.push_back(ckercn); \
6327         } \
6328     } \
6329     while ((void)0, 0)
6330
6331 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6332                               InputArray src4, InputArray src5, InputArray src6,
6333                               InputArray src7, InputArray src8, InputArray src9,
6334                               OclVectorStrategy strat)
6335 {
6336     const ocl::Device & d = ocl::Device::getDefault();
6337
6338     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6339         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6340         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6341         d.preferredVectorWidthDouble(), -1 };
6342
6343     // if the device says don't use vectors
6344     if (vectorWidths[0] == 1)
6345     {
6346         // it's heuristic
6347         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6348         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6349         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6350     }
6351
6352     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6353 }
6354
6355 int checkOptimalVectorWidth(const int *vectorWidths,
6356                             InputArray src1, InputArray src2, InputArray src3,
6357                             InputArray src4, InputArray src5, InputArray src6,
6358                             InputArray src7, InputArray src8, InputArray src9,
6359                             OclVectorStrategy strat)
6360 {
6361     CV_Assert(vectorWidths);
6362
6363     int ref_type = src1.type();
6364
6365     std::vector<size_t> offsets, steps, cols;
6366     std::vector<int> dividers, kercns;
6367     PROCESS_SRC(src1);
6368     PROCESS_SRC(src2);
6369     PROCESS_SRC(src3);
6370     PROCESS_SRC(src4);
6371     PROCESS_SRC(src5);
6372     PROCESS_SRC(src6);
6373     PROCESS_SRC(src7);
6374     PROCESS_SRC(src8);
6375     PROCESS_SRC(src9);
6376
6377     size_t size = offsets.size();
6378
6379     for (size_t i = 0; i < size; ++i)
6380         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6381             dividers[i] >>= 1, kercns[i] >>= 1;
6382
6383     // default strategy
6384     int kercn = *std::min_element(kercns.begin(), kercns.end());
6385
6386     return kercn;
6387 }
6388
6389 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6390                                  InputArray src4, InputArray src5, InputArray src6,
6391                                  InputArray src7, InputArray src8, InputArray src9)
6392 {
6393     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6394 }
6395
6396 #undef PROCESS_SRC
6397
6398
6399 // TODO Make this as a method of OpenCL "BuildOptions" class
6400 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6401 {
6402     if (!buildOptions.empty())
6403         buildOptions += " ";
6404     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6405     buildOptions += format(
6406             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6407             name.c_str(), ocl::typeToStr(type),
6408             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6409             name.c_str(), (int)CV_MAT_CN(type),
6410             name.c_str(), (int)CV_ELEM_SIZE(type),
6411             name.c_str(), (int)CV_ELEM_SIZE1(type),
6412             name.c_str(), (int)depth
6413             );
6414 }
6415
6416
6417 struct Image2D::Impl
6418 {
6419     Impl(const UMat &src, bool norm, bool alias)
6420     {
6421         handle = 0;
6422         refcount = 1;
6423         init(src, norm, alias);
6424     }
6425
6426     ~Impl()
6427     {
6428         if (handle)
6429             clReleaseMemObject(handle);
6430     }
6431
6432     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6433     {
6434         cl_image_format format;
6435         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6436                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6437         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6438                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6439         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6440
6441         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6442         int channelOrder = channelOrders[cn];
6443         format.image_channel_data_type = (cl_channel_type)channelType;
6444         format.image_channel_order = (cl_channel_order)channelOrder;
6445         return format;
6446     }
6447
6448     static bool isFormatSupported(cl_image_format format)
6449     {
6450         if (!haveOpenCL())
6451             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6452
6453         cl_context context = (cl_context)Context::getDefault().ptr();
6454         if (!context)
6455             return false;
6456
6457         // Figure out how many formats are supported by this context.
6458         cl_uint numFormats = 0;
6459         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6460                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6461                                                 NULL, &numFormats);
6462         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6463         if (numFormats > 0)
6464         {
6465             AutoBuffer<cl_image_format> formats(numFormats);
6466             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6467                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6468                                              formats.data(), NULL);
6469             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6470             for (cl_uint i = 0; i < numFormats; ++i)
6471             {
6472                 if (!memcmp(&formats[i], &format, sizeof(format)))
6473                 {
6474                     return true;
6475                 }
6476             }
6477         }
6478         return false;
6479     }
6480
6481     void init(const UMat &src, bool norm, bool alias)
6482     {
6483         if (!haveOpenCL())
6484             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6485
6486         CV_Assert(!src.empty());
6487         CV_Assert(ocl::Device::getDefault().imageSupport());
6488
6489         int err, depth = src.depth(), cn = src.channels();
6490         CV_Assert(cn <= 4);
6491         cl_image_format format = getImageFormat(depth, cn, norm);
6492
6493         if (!isFormatSupported(format))
6494             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
6495
6496         if (alias && !src.handle(ACCESS_RW))
6497             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
6498
6499         cl_context context = (cl_context)Context::getDefault().ptr();
6500         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
6501
6502 #ifdef CL_VERSION_1_2
6503         // this enables backwards portability to
6504         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
6505         const Device & d = ocl::Device::getDefault();
6506         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
6507         CV_Assert(!alias || canCreateAlias(src));
6508         if (1 < major || (1 == major && 2 <= minor))
6509         {
6510             cl_image_desc desc;
6511             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
6512             desc.image_width      = src.cols;
6513             desc.image_height     = src.rows;
6514             desc.image_depth      = 0;
6515             desc.image_array_size = 1;
6516             desc.image_row_pitch  = alias ? src.step[0] : 0;
6517             desc.image_slice_pitch = 0;
6518             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
6519             desc.num_mip_levels   = 0;
6520             desc.num_samples      = 0;
6521             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
6522         }
6523         else
6524 #endif
6525         {
6526             CV_SUPPRESS_DEPRECATED_START
6527             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
6528             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
6529             CV_SUPPRESS_DEPRECATED_END
6530         }
6531         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
6532
6533         size_t origin[] = { 0, 0, 0 };
6534         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
6535
6536         cl_mem devData;
6537         if (!alias && !src.isContinuous())
6538         {
6539             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
6540             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
6541                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
6542                 ).c_str());
6543
6544             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
6545             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
6546                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
6547             CV_OCL_DBG_CHECK(clFlush(queue));
6548         }
6549         else
6550         {
6551             devData = (cl_mem)src.handle(ACCESS_READ);
6552         }
6553         CV_Assert(devData != NULL);
6554
6555         if (!alias)
6556         {
6557             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
6558             if (!src.isContinuous())
6559             {
6560                 CV_OCL_DBG_CHECK(clFlush(queue));
6561                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
6562             }
6563         }
6564     }
6565
6566     IMPLEMENT_REFCOUNTABLE();
6567
6568     cl_mem handle;
6569 };
6570
6571 Image2D::Image2D()
6572 {
6573     p = NULL;
6574 }
6575
6576 Image2D::Image2D(const UMat &src, bool norm, bool alias)
6577 {
6578     p = new Impl(src, norm, alias);
6579 }
6580
6581 bool Image2D::canCreateAlias(const UMat &m)
6582 {
6583     bool ret = false;
6584     const Device & d = ocl::Device::getDefault();
6585     if (d.imageFromBufferSupport() && !m.empty())
6586     {
6587         // This is the required pitch alignment in pixels
6588         uint pitchAlign = d.imagePitchAlignment();
6589         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
6590         {
6591             // We don't currently handle the case where the buffer was created
6592             // with CL_MEM_USE_HOST_PTR
6593             if (!m.u->tempUMat())
6594             {
6595                 ret = true;
6596             }
6597         }
6598     }
6599     return ret;
6600 }
6601
6602 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
6603 {
6604     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
6605
6606     return Impl::isFormatSupported(format);
6607 }
6608
6609 Image2D::Image2D(const Image2D & i)
6610 {
6611     p = i.p;
6612     if (p)
6613         p->addref();
6614 }
6615
6616 Image2D & Image2D::operator = (const Image2D & i)
6617 {
6618     if (i.p != p)
6619     {
6620         if (i.p)
6621             i.p->addref();
6622         if (p)
6623             p->release();
6624         p = i.p;
6625     }
6626     return *this;
6627 }
6628
6629 Image2D::~Image2D()
6630 {
6631     if (p)
6632         p->release();
6633 }
6634
6635 void* Image2D::ptr() const
6636 {
6637     return p ? p->handle : 0;
6638 }
6639
6640 bool internal::isOpenCLForced()
6641 {
6642     static bool initialized = false;
6643     static bool value = false;
6644     if (!initialized)
6645     {
6646         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
6647         initialized = true;
6648     }
6649     return value;
6650 }
6651
6652 bool internal::isPerformanceCheckBypassed()
6653 {
6654     static bool initialized = false;
6655     static bool value = false;
6656     if (!initialized)
6657     {
6658         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
6659         initialized = true;
6660     }
6661     return value;
6662 }
6663
6664 bool internal::isCLBuffer(UMat& u)
6665 {
6666     void* h = u.handle(ACCESS_RW);
6667     if (!h)
6668         return true;
6669     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
6670 #if 1
6671     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
6672         return false;
6673 #else
6674     cl_mem_object_type type = 0;
6675     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
6676     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
6677         return false;
6678 #endif
6679     return true;
6680 }
6681
6682 struct Timer::Impl
6683 {
6684     const Queue queue;
6685
6686     Impl(const Queue& q)
6687         : queue(q)
6688     {
6689     }
6690
6691     ~Impl(){}
6692
6693     void start()
6694     {
6695         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6696         timer.start();
6697     }
6698
6699     void stop()
6700     {
6701         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6702         timer.stop();
6703     }
6704
6705     uint64 durationNS() const
6706     {
6707         return (uint64)(timer.getTimeSec() * 1e9);
6708     }
6709
6710     TickMeter timer;
6711 };
6712
6713 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
6714 Timer::~Timer() { delete p; }
6715
6716 void Timer::start()
6717 {
6718     CV_Assert(p);
6719     p->start();
6720 }
6721
6722 void Timer::stop()
6723 {
6724     CV_Assert(p);
6725     p->stop();
6726 }
6727
6728 uint64 Timer::durationNS() const
6729 {
6730     CV_Assert(p);
6731     return p->durationNS();
6732 }
6733
6734 }} // namespace
6735
6736 #endif // HAVE_OPENCL