Merge pull request #18234 from l-bat:onnx_reshape
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_OPENCL_SVM
117 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
118 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
119 #include "opencv2/core/opencl/opencl_svm.hpp"
120 #endif
121
122 #include "umatrix.hpp"
123
124 namespace cv { namespace ocl {
125
126 #define IMPLEMENT_REFCOUNTABLE() \
127     void addref() { CV_XADD(&refcount, 1); } \
128     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
129     int refcount
130
131 static cv::utils::AllocatorStatistics opencl_allocator_stats;
132
133 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
134 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
135 {
136     return opencl_allocator_stats;
137 }
138
139 #ifndef _DEBUG
140 static bool isRaiseError()
141 {
142     static bool initialized = false;
143     static bool value = false;
144     if (!initialized)
145     {
146         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
147         initialized = true;
148     }
149     return value;
150 }
151 #endif
152
153 #if CV_OPENCL_TRACE_CHECK
154 static inline
155 void traceOpenCLCheck(cl_int status, const char* message)
156 {
157     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
158 }
159 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
160 #else
161 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
162 #endif
163
164 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
165     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
166
167 #define CV_OCL_CHECK_RESULT(check_result, msg) \
168     do { \
169         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
170         if (check_result != CL_SUCCESS) \
171         { \
172             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
173             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
174             CV_Error(Error::OpenCLApiCallError, error_msg); \
175         } \
176     } while (0)
177
178 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
179
180 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
181
182 #ifdef _DEBUG
183 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
184 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
185 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
186 #else
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
188     do { \
189         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
190         if (check_result != CL_SUCCESS && isRaiseError()) \
191         { \
192             if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
193             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
194             CV_Error(Error::OpenCLApiCallError, error_msg); \
195         } \
196     } while (0)
197 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
198 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
199 #endif
200
201
202 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
203 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
204 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
205 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
206
207 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
208 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
209 #endif
210
211 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
212 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
213 #ifdef __APPLE__
214         true
215 #else
216         false
217 #endif
218 );
219
220 static const String getBuildExtraOptions()
221 {
222     static String param_buildExtraOptions;
223     static bool initialized = false;
224     if (!initialized)
225     {
226         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
227         initialized = true;
228         if (!param_buildExtraOptions.empty())
229             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
230     }
231     return param_buildExtraOptions;
232 }
233
234 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
235 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
236
237
238 struct UMat2D
239 {
240     UMat2D(const UMat& m)
241     {
242         offset = (int)m.offset;
243         step = (int)m.step;
244         rows = m.rows;
245         cols = m.cols;
246     }
247     int offset;
248     int step;
249     int rows;
250     int cols;
251 };
252
253 struct UMat3D
254 {
255     UMat3D(const UMat& m)
256     {
257         offset = (int)m.offset;
258         step = (int)m.step.p[1];
259         slicestep = (int)m.step.p[0];
260         slices = (int)m.size.p[0];
261         rows = m.size.p[1];
262         cols = m.size.p[2];
263     }
264     int offset;
265     int slicestep;
266     int step;
267     int slices;
268     int rows;
269     int cols;
270 };
271
272 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
273 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
274 {
275     static uint64 table[256];
276     static bool initialized = false;
277
278     if( !initialized )
279     {
280         for( int i = 0; i < 256; i++ )
281         {
282             uint64 c = i;
283             for( int j = 0; j < 8; j++ )
284                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
285             table[i] = c;
286         }
287         initialized = true;
288     }
289
290     uint64 crc = ~crc0;
291     for( size_t idx = 0; idx < size; idx++ )
292         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
293
294     return ~crc;
295 }
296
297 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
298 struct OpenCLBinaryCacheConfigurator
299 {
300     cv::String cache_path_;
301     cv::String cache_lock_filename_;
302     cv::Ptr<utils::fs::FileLock> cache_lock_;
303
304     typedef std::map<std::string, std::string> ContextCacheType;
305     ContextCacheType prepared_contexts_;
306     Mutex mutex_prepared_contexts_;
307
308     OpenCLBinaryCacheConfigurator()
309     {
310         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
311         if (!CV_OPENCL_CACHE_ENABLE)
312         {
313             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
314             return;
315         }
316         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
317         if (cache_path_.empty())
318         {
319             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
320         }
321         do
322         {
323             try
324             {
325                 if (cache_path_.empty())
326                     break;
327                 if (cache_path_ == "disabled")
328                     break;
329                 if (!utils::fs::createDirectories(cache_path_))
330                 {
331                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
332                     clear();
333                     break;
334                 }
335
336                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
337                 {
338                     cache_lock_filename_ = cache_path_ + ".lock";
339                     if (!utils::fs::exists(cache_lock_filename_))
340                     {
341                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
342                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
343                         if (!lock_filename.is_open())
344                         {
345                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
346                             break;
347                         }
348                     }
349
350                     try
351                     {
352                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
353                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
354                         {
355                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
356                         }
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
358                     }
359                     catch (const cv::Exception& e)
360                     {
361                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
362                     }
363                     catch (...)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
366                     }
367                 }
368                 else
369                 {
370                     if (CV_OPENCL_CACHE_WRITE)
371                     {
372                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
373                                 "(not safe for multiprocess environment)");
374                     }
375                     else
376                     {
377                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
378                     }
379                 }
380             }
381             catch (const cv::Exception& e)
382             {
383                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
384                 clear();
385             }
386         } while (0);
387         if (!cache_path_.empty())
388         {
389             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
390             {
391                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
392                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
393             }
394             else
395             {
396                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
397             }
398         }
399     }
400
401     void clear()
402     {
403         cache_path_.clear();
404         cache_lock_filename_.clear();
405         cache_lock_.release();
406     }
407
408     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
409             const std::string& cleanup_prefix)
410     {
411         if (cache_path_.empty())
412             return std::string();
413
414         AutoLock lock(mutex_prepared_contexts_);
415
416         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
417         if (found_it != prepared_contexts_.end())
418             return found_it->second;
419
420         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
421
422         std::string target_directory = cache_path_ + ctx_prefix + "/";
423         bool result = utils::fs::isDirectory(target_directory);
424         if (!result)
425         {
426             try
427             {
428                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
429                 if (utils::fs::createDirectories(target_directory))
430                 {
431                     result = true;
432                 }
433                 else
434                 {
435                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
436                 }
437             }
438             catch (const cv::Exception& e)
439             {
440                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
441             }
442         }
443         target_directory = result ? target_directory : std::string();
444         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
445
446         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
447         {
448             try
449             {
450                 std::vector<String> entries;
451                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
452                 std::vector<String> remove_entries;
453                 for (size_t i = 0; i < entries.size(); i++)
454                 {
455                     const String& name = entries[i];
456                     if (0 == name.find(cleanup_prefix))
457                     {
458                         if (0 == name.find(ctx_prefix))
459                             continue; // skip current
460                         remove_entries.push_back(name);
461                     }
462                 }
463                 if (!remove_entries.empty())
464                 {
465                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
466                             ? "Detected OpenCL cache directory for other version of OpenCL device."
467                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
468                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
469                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
470                     for (size_t i = 0; i < remove_entries.size(); i++)
471                     {
472                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
473                     }
474                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
475
476                     for (size_t i = 0; i < remove_entries.size(); i++)
477                     {
478                         const String& name = remove_entries[i];
479                         cv::String path = utils::fs::join(cache_path_, name);
480                         try
481                         {
482                             utils::fs::remove_all(path);
483                             CV_LOG_WARNING(NULL, "Removed: " << path);
484                         }
485                         catch (const cv::Exception& e)
486                         {
487                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
488                         }
489                     }
490                 }
491             }
492             catch (...)
493             {
494                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
495             }
496         }
497
498         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
499         return target_directory;
500     }
501
502     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
503     {
504         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
505     }
506 };
507 class BinaryProgramFile
508 {
509     enum { MAX_ENTRIES = 64 };
510
511     typedef unsigned int uint32_t;
512
513     struct CV_DECL_ALIGNED(4) FileHeader
514     {
515         uint32_t sourceSignatureSize;
516         //char sourceSignature[];
517     };
518
519     struct CV_DECL_ALIGNED(4) FileTable
520     {
521         uint32_t numberOfEntries;
522         //uint32_t firstEntryOffset[];
523     };
524
525     struct CV_DECL_ALIGNED(4) FileEntry
526     {
527         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
528         uint32_t keySize;
529         uint32_t dataSize;
530         //char key[];
531         //char data[];
532     };
533
534     const std::string fileName_;
535     const char* const sourceSignature_;
536     const size_t sourceSignatureSize_;
537
538     std::fstream f;
539
540     uint32_t entryOffsets[MAX_ENTRIES];
541
542     uint32_t getHash(const std::string& options)
543     {
544         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
545         return hash & (MAX_ENTRIES - 1);
546     }
547
548     inline size_t getFileSize()
549     {
550         size_t pos = (size_t)f.tellg();
551         f.seekg(0, std::fstream::end);
552         size_t fileSize = (size_t)f.tellg();
553         f.seekg(pos, std::fstream::beg);
554         return fileSize;
555     }
556     inline uint32_t readUInt32()
557     {
558         uint32_t res = 0;
559         f.read((char*)&res, sizeof(uint32_t));
560         CV_Assert(!f.fail());
561         return res;
562     }
563     inline void writeUInt32(const uint32_t value)
564     {
565         uint32_t v = value;
566         f.write((char*)&v, sizeof(uint32_t));
567         CV_Assert(!f.fail());
568     }
569
570     inline void seekReadAbsolute(size_t pos)
571     {
572         f.seekg(pos, std::fstream::beg);
573         CV_Assert(!f.fail());
574     }
575     inline void seekReadRelative(size_t pos)
576     {
577         f.seekg(pos, std::fstream::cur);
578         CV_Assert(!f.fail());
579     }
580
581     inline void seekWriteAbsolute(size_t pos)
582     {
583         f.seekp(pos, std::fstream::beg);
584         CV_Assert(!f.fail());
585     }
586
587     void clearFile()
588     {
589         f.close();
590         if (0 != remove(fileName_.c_str()))
591             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
592         return;
593     }
594
595 public:
596     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
597         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
598     {
599         CV_StaticAssert(sizeof(uint32_t) == 4, "");
600         CV_Assert(sourceSignature_ != NULL);
601         CV_Assert(sourceSignatureSize_ > 0);
602         memset(entryOffsets, 0, sizeof(entryOffsets));
603
604         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
605         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
606         if(f.is_open() && getFileSize() > 0)
607         {
608             bool isValid = false;
609             try
610             {
611                 uint32_t fileSourceSignatureSize = readUInt32();
612                 if (fileSourceSignatureSize == sourceSignatureSize_)
613                 {
614                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
615                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
616                     if (f.eof())
617                     {
618                         CV_LOG_ERROR(NULL, "Unexpected EOF");
619                     }
620                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
621                     {
622                         isValid = true;
623                     }
624                 }
625                 if (!isValid)
626                 {
627                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
628                 }
629             }
630             catch (const cv::Exception& e)
631             {
632                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
633             }
634             catch (...)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
637             }
638             if (!isValid)
639             {
640                 clearFile();
641             }
642             else
643             {
644                 seekReadAbsolute(0);
645             }
646         }
647     }
648
649     bool read(const std::string& key, std::vector<char>& buf)
650     {
651         if (!f.is_open())
652             return false;
653
654         size_t fileSize = getFileSize();
655         if (fileSize == 0)
656         {
657             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
658             clearFile();
659             return false;
660         }
661         seekReadAbsolute(0);
662
663         // bypass FileHeader
664         uint32_t fileSourceSignatureSize = readUInt32();
665         CV_Assert(fileSourceSignatureSize > 0);
666         seekReadRelative(fileSourceSignatureSize);
667
668         uint32_t numberOfEntries = readUInt32();
669         CV_Assert(numberOfEntries > 0);
670         if (numberOfEntries != MAX_ENTRIES)
671         {
672             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
673             clearFile();
674             return false;
675         }
676         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
677         CV_Assert(!f.fail());
678
679         uint32_t entryNum = getHash(key);
680
681         uint32_t entryOffset = entryOffsets[entryNum];
682         FileEntry entry;
683         while (entryOffset > 0)
684         {
685             seekReadAbsolute(entryOffset);
686             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
687             f.read((char*)&entry, sizeof(entry));
688             CV_Assert(!f.fail());
689             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
690             if (key.size() == entry.keySize)
691             {
692                 if (entry.keySize > 0)
693                 {
694                     f.read(fileKey.data(), entry.keySize);
695                     CV_Assert(!f.fail());
696                 }
697                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
698                 {
699                     buf.resize(entry.dataSize);
700                     f.read(&buf[0], entry.dataSize);
701                     CV_Assert(!f.fail());
702                     seekReadAbsolute(0);
703                     CV_LOG_VERBOSE(NULL, 0, "Read...");
704                     return true;
705                 }
706             }
707             if (entry.nextEntryFileOffset == 0)
708                 break;
709             entryOffset = entry.nextEntryFileOffset;
710         }
711         return false;
712     }
713
714     bool write(const std::string& key, std::vector<char>& buf)
715     {
716         if (!f.is_open())
717         {
718             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
719             if (!f.is_open())
720             {
721                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
722                 if (!f.is_open())
723                 {
724                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
725                     return false;
726                 }
727             }
728         }
729
730         size_t fileSize = getFileSize();
731         if (fileSize == 0)
732         {
733             // Write header
734             seekWriteAbsolute(0);
735             writeUInt32((uint32_t)sourceSignatureSize_);
736             f.write(sourceSignature_, sourceSignatureSize_);
737             CV_Assert(!f.fail());
738
739             writeUInt32(MAX_ENTRIES);
740             memset(entryOffsets, 0, sizeof(entryOffsets));
741             f.write((char*)entryOffsets, sizeof(entryOffsets));
742             CV_Assert(!f.fail());
743             f.flush();
744             CV_Assert(!f.fail());
745             f.close();
746             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
747             CV_Assert(f.is_open());
748             fileSize = getFileSize();
749         }
750         seekReadAbsolute(0);
751
752         // bypass FileHeader
753         uint32_t fileSourceSignatureSize = readUInt32();
754         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
755         seekReadRelative(fileSourceSignatureSize);
756
757         uint32_t numberOfEntries = readUInt32();
758         CV_Assert(numberOfEntries > 0);
759         if (numberOfEntries != MAX_ENTRIES)
760         {
761             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
762             clearFile();
763             return false;
764         }
765         size_t tableEntriesOffset = (size_t)f.tellg();
766         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
767         CV_Assert(!f.fail());
768
769         uint32_t entryNum = getHash(key);
770
771         uint32_t entryOffset = entryOffsets[entryNum];
772         FileEntry entry;
773         while (entryOffset > 0)
774         {
775             seekReadAbsolute(entryOffset);
776             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
777             f.read((char*)&entry, sizeof(entry));
778             CV_Assert(!f.fail());
779             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
780             if (key.size() == entry.keySize)
781             {
782                 if (entry.keySize > 0)
783                 {
784                     f.read(fileKey.data(), entry.keySize);
785                     CV_Assert(!f.fail());
786                 }
787                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
788                 {
789                     // duplicate
790                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
791                     return false;
792                 }
793             }
794             if (entry.nextEntryFileOffset == 0)
795                 break;
796             entryOffset = entry.nextEntryFileOffset;
797         }
798         seekReadAbsolute(0);
799         if (entryOffset > 0)
800         {
801             seekWriteAbsolute(entryOffset);
802             entry.nextEntryFileOffset = (uint32_t)fileSize;
803             f.write((char*)&entry, sizeof(entry));
804             CV_Assert(!f.fail());
805         }
806         else
807         {
808             entryOffsets[entryNum] = (uint32_t)fileSize;
809             seekWriteAbsolute(tableEntriesOffset);
810             f.write((char*)entryOffsets, sizeof(entryOffsets));
811             CV_Assert(!f.fail());
812         }
813         seekWriteAbsolute(fileSize);
814         entry.nextEntryFileOffset = 0;
815         entry.dataSize = (uint32_t)buf.size();
816         entry.keySize = (uint32_t)key.size();
817         f.write((char*)&entry, sizeof(entry));
818         CV_Assert(!f.fail());
819         f.write(key.c_str(), entry.keySize);
820         CV_Assert(!f.fail());
821         f.write(&buf[0], entry.dataSize);
822         CV_Assert(!f.fail());
823         f.flush();
824         CV_Assert(!f.fail());
825         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
826         return true;
827     }
828 };
829 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
830
831
832 // true if we have initialized OpenCL subsystem with available platforms
833 static bool g_isOpenCVActivated = false;
834
835 bool haveOpenCL()
836 {
837     CV_TRACE_FUNCTION();
838     static bool g_isOpenCLInitialized = false;
839     static bool g_isOpenCLAvailable = false;
840
841     if (!g_isOpenCLInitialized)
842     {
843         CV_TRACE_REGION("Init_OpenCL_Runtime");
844         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
845         if (envPath)
846         {
847             if (cv::String(envPath) == "disabled")
848             {
849                 g_isOpenCLAvailable = false;
850                 g_isOpenCLInitialized = true;
851             }
852         }
853         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
854         try
855         {
856             cl_uint n = 0;
857             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
858             g_isOpenCVActivated = n > 0;
859         }
860         catch (...)
861         {
862             g_isOpenCLAvailable = false;
863         }
864         g_isOpenCLInitialized = true;
865     }
866     return g_isOpenCLAvailable;
867 }
868
869 bool useOpenCL()
870 {
871     CoreTLSData& data = getCoreTlsData();
872     if (data.useOpenCL < 0)
873     {
874         try
875         {
876             data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
877         }
878         catch (...)
879         {
880             data.useOpenCL = 0;
881         }
882     }
883     return data.useOpenCL > 0;
884 }
885
886 bool isOpenCLActivated()
887 {
888     if (!g_isOpenCVActivated)
889         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
890     return useOpenCL();
891 }
892
893 void setUseOpenCL(bool flag)
894 {
895     CV_TRACE_FUNCTION();
896
897     CoreTLSData& data = getCoreTlsData();
898     if (!flag)
899     {
900         data.useOpenCL = 0;
901     }
902     else if( haveOpenCL() )
903     {
904         data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
905     }
906 }
907
908 #ifdef HAVE_CLAMDBLAS
909
910 class AmdBlasHelper
911 {
912 public:
913     static AmdBlasHelper & getInstance()
914     {
915         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
916     }
917
918     bool isAvailable() const
919     {
920         return g_isAmdBlasAvailable;
921     }
922
923     ~AmdBlasHelper()
924     {
925         try
926         {
927             clAmdBlasTeardown();
928         }
929         catch (...) { }
930     }
931
932 protected:
933     AmdBlasHelper()
934     {
935         if (!g_isAmdBlasInitialized)
936         {
937             AutoLock lock(getInitializationMutex());
938
939             if (!g_isAmdBlasInitialized)
940             {
941                 if (haveOpenCL())
942                 {
943                     try
944                     {
945                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
946                     }
947                     catch (...)
948                     {
949                         g_isAmdBlasAvailable = false;
950                     }
951                 }
952                 else
953                     g_isAmdBlasAvailable = false;
954
955                 g_isAmdBlasInitialized = true;
956             }
957         }
958     }
959
960 private:
961     static bool g_isAmdBlasInitialized;
962     static bool g_isAmdBlasAvailable;
963 };
964
965 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
966 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
967
968 bool haveAmdBlas()
969 {
970     return AmdBlasHelper::getInstance().isAvailable();
971 }
972
973 #else
974
975 bool haveAmdBlas()
976 {
977     return false;
978 }
979
980 #endif
981
982 #ifdef HAVE_CLAMDFFT
983
984 class AmdFftHelper
985 {
986 public:
987     static AmdFftHelper & getInstance()
988     {
989         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
990     }
991
992     bool isAvailable() const
993     {
994         return g_isAmdFftAvailable;
995     }
996
997     ~AmdFftHelper()
998     {
999         try
1000         {
1001 //            clAmdFftTeardown();
1002         }
1003         catch (...) { }
1004     }
1005
1006 protected:
1007     AmdFftHelper()
1008     {
1009         if (!g_isAmdFftInitialized)
1010         {
1011             AutoLock lock(getInitializationMutex());
1012
1013             if (!g_isAmdFftInitialized)
1014             {
1015                 if (haveOpenCL())
1016                 {
1017                     try
1018                     {
1019                         cl_uint major, minor, patch;
1020                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1021
1022                         // it throws exception in case AmdFft binaries are not found
1023                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1024                         g_isAmdFftAvailable = true;
1025                     }
1026                     catch (const Exception &)
1027                     {
1028                         g_isAmdFftAvailable = false;
1029                     }
1030                 }
1031                 else
1032                     g_isAmdFftAvailable = false;
1033
1034                 g_isAmdFftInitialized = true;
1035             }
1036         }
1037     }
1038
1039 private:
1040     static clAmdFftSetupData setupData;
1041     static bool g_isAmdFftInitialized;
1042     static bool g_isAmdFftAvailable;
1043 };
1044
1045 clAmdFftSetupData AmdFftHelper::setupData;
1046 bool AmdFftHelper::g_isAmdFftAvailable = false;
1047 bool AmdFftHelper::g_isAmdFftInitialized = false;
1048
1049 bool haveAmdFft()
1050 {
1051     return AmdFftHelper::getInstance().isAvailable();
1052 }
1053
1054 #else
1055
1056 bool haveAmdFft()
1057 {
1058     return false;
1059 }
1060
1061 #endif
1062
1063 bool haveSVM()
1064 {
1065 #ifdef HAVE_OPENCL_SVM
1066     return true;
1067 #else
1068     return false;
1069 #endif
1070 }
1071
1072 void finish()
1073 {
1074     Queue::getDefault().finish();
1075 }
1076
1077 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1078
1079 struct Platform::Impl
1080 {
1081     Impl()
1082     {
1083         refcount = 1;
1084         handle = 0;
1085         initialized = false;
1086     }
1087
1088     ~Impl() {}
1089
1090     void init()
1091     {
1092         if( !initialized )
1093         {
1094             //cl_uint num_entries
1095             cl_uint n = 0;
1096             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1097                 handle = 0;
1098             if( handle != 0 )
1099             {
1100                 char buf[1000];
1101                 size_t len = 0;
1102                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1103                 buf[len] = '\0';
1104                 vendor = String(buf);
1105             }
1106
1107             initialized = true;
1108         }
1109     }
1110
1111     IMPLEMENT_REFCOUNTABLE();
1112
1113     cl_platform_id handle;
1114     String vendor;
1115     bool initialized;
1116 };
1117
1118 Platform::Platform()
1119 {
1120     p = 0;
1121 }
1122
1123 Platform::~Platform()
1124 {
1125     if(p)
1126         p->release();
1127 }
1128
1129 Platform::Platform(const Platform& pl)
1130 {
1131     p = (Impl*)pl.p;
1132     if(p)
1133         p->addref();
1134 }
1135
1136 Platform& Platform::operator = (const Platform& pl)
1137 {
1138     Impl* newp = (Impl*)pl.p;
1139     if(newp)
1140         newp->addref();
1141     if(p)
1142         p->release();
1143     p = newp;
1144     return *this;
1145 }
1146
1147 void* Platform::ptr() const
1148 {
1149     return p ? p->handle : 0;
1150 }
1151
1152 Platform& Platform::getDefault()
1153 {
1154     static Platform p;
1155     if( !p.p )
1156     {
1157         p.p = new Impl;
1158         p.p->init();
1159     }
1160     return p;
1161 }
1162
1163 /////////////////////////////////////// Device ////////////////////////////////////////////
1164
1165 // deviceVersion has format
1166 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1167 // by specification
1168 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1169 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1170 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1171 {
1172     major = minor = 0;
1173     if (10 >= deviceVersion.length())
1174         return;
1175     const char *pstr = deviceVersion.c_str();
1176     if (0 != strncmp(pstr, "OpenCL ", 7))
1177         return;
1178     size_t ppos = deviceVersion.find('.', 7);
1179     if (String::npos == ppos)
1180         return;
1181     String temp = deviceVersion.substr(7, ppos - 7);
1182     major = atoi(temp.c_str());
1183     temp = deviceVersion.substr(ppos + 1);
1184     minor = atoi(temp.c_str());
1185 }
1186
1187 struct Device::Impl
1188 {
1189     Impl(void* d)
1190     {
1191         handle = (cl_device_id)d;
1192         refcount = 1;
1193
1194         name_ = getStrProp(CL_DEVICE_NAME);
1195         version_ = getStrProp(CL_DEVICE_VERSION);
1196         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1197         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1198         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1199         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1200         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1201         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1202         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1203         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1204
1205         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1206         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1207
1208         size_t pos = 0;
1209         while (pos < extensions_.size())
1210         {
1211             size_t pos2 = extensions_.find(' ', pos);
1212             if (pos2 == String::npos)
1213                 pos2 = extensions_.size();
1214             if (pos2 > pos)
1215             {
1216                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1217                 extensions_set_.insert(extensionName);
1218             }
1219             pos = pos2 + 1;
1220         }
1221
1222         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1223
1224         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1225         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1226             vendorName_ == "AMD")
1227             vendorID_ = VENDOR_AMD;
1228         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1229             vendorID_ = VENDOR_INTEL;
1230         else if (vendorName_ == "NVIDIA Corporation")
1231             vendorID_ = VENDOR_NVIDIA;
1232         else
1233             vendorID_ = UNKNOWN_VENDOR;
1234
1235         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1236         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1237         {
1238             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1239             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1240                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1241             maxWorkGroupSize_ = new_maxWorkGroupSize;
1242         }
1243 #if 0
1244         if (isExtensionSupported("cl_khr_spir"))
1245         {
1246 #ifndef CL_DEVICE_SPIR_VERSIONS
1247 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1248 #endif
1249             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1250             std::cout << spir_versions << std::endl;
1251         }
1252 #endif
1253     }
1254
1255     template<typename _TpCL, typename _TpOut>
1256     _TpOut getProp(cl_device_info prop) const
1257     {
1258         _TpCL temp=_TpCL();
1259         size_t sz = 0;
1260
1261         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1262             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1263     }
1264
1265     bool getBoolProp(cl_device_info prop) const
1266     {
1267         cl_bool temp = CL_FALSE;
1268         size_t sz = 0;
1269
1270         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1271             sz == sizeof(temp) ? temp != 0 : false;
1272     }
1273
1274     String getStrProp(cl_device_info prop) const
1275     {
1276         char buf[4096];
1277         size_t sz=0;
1278         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1279             sz < sizeof(buf) ? String(buf) : String();
1280     }
1281
1282     bool isExtensionSupported(const std::string& extensionName) const
1283     {
1284         return extensions_set_.count(extensionName) > 0;
1285     }
1286
1287
1288     IMPLEMENT_REFCOUNTABLE();
1289
1290     cl_device_id handle;
1291
1292     String name_;
1293     String version_;
1294     std::string extensions_;
1295     int doubleFPConfig_;
1296     bool hostUnifiedMemory_;
1297     int maxComputeUnits_;
1298     size_t maxWorkGroupSize_;
1299     int type_;
1300     int addressBits_;
1301     int deviceVersionMajor_;
1302     int deviceVersionMinor_;
1303     String driverVersion_;
1304     String vendorName_;
1305     int vendorID_;
1306     bool intelSubgroupsSupport_;
1307
1308     std::set<std::string> extensions_set_;
1309 };
1310
1311
1312 Device::Device()
1313 {
1314     p = 0;
1315 }
1316
1317 Device::Device(void* d)
1318 {
1319     p = 0;
1320     set(d);
1321 }
1322
1323 Device::Device(const Device& d)
1324 {
1325     p = d.p;
1326     if(p)
1327         p->addref();
1328 }
1329
1330 Device& Device::operator = (const Device& d)
1331 {
1332     Impl* newp = (Impl*)d.p;
1333     if(newp)
1334         newp->addref();
1335     if(p)
1336         p->release();
1337     p = newp;
1338     return *this;
1339 }
1340
1341 Device::~Device()
1342 {
1343     if(p)
1344         p->release();
1345 }
1346
1347 void Device::set(void* d)
1348 {
1349     if(p)
1350         p->release();
1351     p = new Impl(d);
1352 }
1353
1354 void* Device::ptr() const
1355 {
1356     return p ? p->handle : 0;
1357 }
1358
1359 String Device::name() const
1360 { return p ? p->name_ : String(); }
1361
1362 String Device::extensions() const
1363 { return p ? String(p->extensions_) : String(); }
1364
1365 bool Device::isExtensionSupported(const String& extensionName) const
1366 { return p ? p->isExtensionSupported(extensionName) : false; }
1367
1368 String Device::version() const
1369 { return p ? p->version_ : String(); }
1370
1371 String Device::vendorName() const
1372 { return p ? p->vendorName_ : String(); }
1373
1374 int Device::vendorID() const
1375 { return p ? p->vendorID_ : 0; }
1376
1377 String Device::OpenCL_C_Version() const
1378 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1379
1380 String Device::OpenCLVersion() const
1381 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1382
1383 int Device::deviceVersionMajor() const
1384 { return p ? p->deviceVersionMajor_ : 0; }
1385
1386 int Device::deviceVersionMinor() const
1387 { return p ? p->deviceVersionMinor_ : 0; }
1388
1389 String Device::driverVersion() const
1390 { return p ? p->driverVersion_ : String(); }
1391
1392 int Device::type() const
1393 { return p ? p->type_ : 0; }
1394
1395 int Device::addressBits() const
1396 { return p ? p->addressBits_ : 0; }
1397
1398 bool Device::available() const
1399 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1400
1401 bool Device::compilerAvailable() const
1402 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1403
1404 bool Device::linkerAvailable() const
1405 #ifdef CL_VERSION_1_2
1406 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1407 #else
1408 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1409 #endif
1410
1411 int Device::doubleFPConfig() const
1412 { return p ? p->doubleFPConfig_ : 0; }
1413
1414 int Device::singleFPConfig() const
1415 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1416
1417 int Device::halfFPConfig() const
1418 #ifdef CL_VERSION_1_2
1419 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1420 #else
1421 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1422 #endif
1423
1424 bool Device::endianLittle() const
1425 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1426
1427 bool Device::errorCorrectionSupport() const
1428 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1429
1430 int Device::executionCapabilities() const
1431 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1432
1433 size_t Device::globalMemCacheSize() const
1434 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1435
1436 int Device::globalMemCacheType() const
1437 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1438
1439 int Device::globalMemCacheLineSize() const
1440 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1441
1442 size_t Device::globalMemSize() const
1443 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1444
1445 size_t Device::localMemSize() const
1446 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1447
1448 int Device::localMemType() const
1449 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1450
1451 bool Device::hostUnifiedMemory() const
1452 { return p ? p->hostUnifiedMemory_ : false; }
1453
1454 bool Device::imageSupport() const
1455 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1456
1457 bool Device::imageFromBufferSupport() const
1458 {
1459     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1460 }
1461
1462 uint Device::imagePitchAlignment() const
1463 {
1464 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1465     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1466 #else
1467     return 0;
1468 #endif
1469 }
1470
1471 uint Device::imageBaseAddressAlignment() const
1472 {
1473 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1474     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1475 #else
1476     return 0;
1477 #endif
1478 }
1479
1480 size_t Device::image2DMaxWidth() const
1481 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1482
1483 size_t Device::image2DMaxHeight() const
1484 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1485
1486 size_t Device::image3DMaxWidth() const
1487 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1488
1489 size_t Device::image3DMaxHeight() const
1490 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1491
1492 size_t Device::image3DMaxDepth() const
1493 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1494
1495 size_t Device::imageMaxBufferSize() const
1496 #ifdef CL_VERSION_1_2
1497 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1498 #else
1499 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1500 #endif
1501
1502 size_t Device::imageMaxArraySize() const
1503 #ifdef CL_VERSION_1_2
1504 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1505 #else
1506 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1507 #endif
1508
1509 bool Device::intelSubgroupsSupport() const
1510 { return p ? p->intelSubgroupsSupport_ : false; }
1511
1512 int Device::maxClockFrequency() const
1513 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1514
1515 int Device::maxComputeUnits() const
1516 { return p ? p->maxComputeUnits_ : 0; }
1517
1518 int Device::maxConstantArgs() const
1519 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1520
1521 size_t Device::maxConstantBufferSize() const
1522 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1523
1524 size_t Device::maxMemAllocSize() const
1525 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1526
1527 size_t Device::maxParameterSize() const
1528 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1529
1530 int Device::maxReadImageArgs() const
1531 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1532
1533 int Device::maxWriteImageArgs() const
1534 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1535
1536 int Device::maxSamplers() const
1537 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1538
1539 size_t Device::maxWorkGroupSize() const
1540 { return p ? p->maxWorkGroupSize_ : 0; }
1541
1542 int Device::maxWorkItemDims() const
1543 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1544
1545 void Device::maxWorkItemSizes(size_t* sizes) const
1546 {
1547     if(p)
1548     {
1549         const int MAX_DIMS = 32;
1550         size_t retsz = 0;
1551         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1552                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1553     }
1554 }
1555
1556 int Device::memBaseAddrAlign() const
1557 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1558
1559 int Device::nativeVectorWidthChar() const
1560 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1561
1562 int Device::nativeVectorWidthShort() const
1563 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1564
1565 int Device::nativeVectorWidthInt() const
1566 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1567
1568 int Device::nativeVectorWidthLong() const
1569 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1570
1571 int Device::nativeVectorWidthFloat() const
1572 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1573
1574 int Device::nativeVectorWidthDouble() const
1575 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1576
1577 int Device::nativeVectorWidthHalf() const
1578 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1579
1580 int Device::preferredVectorWidthChar() const
1581 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1582
1583 int Device::preferredVectorWidthShort() const
1584 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1585
1586 int Device::preferredVectorWidthInt() const
1587 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1588
1589 int Device::preferredVectorWidthLong() const
1590 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1591
1592 int Device::preferredVectorWidthFloat() const
1593 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1594
1595 int Device::preferredVectorWidthDouble() const
1596 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1597
1598 int Device::preferredVectorWidthHalf() const
1599 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1600
1601 size_t Device::printfBufferSize() const
1602 #ifdef CL_VERSION_1_2
1603 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1604 #else
1605 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1606 #endif
1607
1608
1609 size_t Device::profilingTimerResolution() const
1610 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1611
1612 const Device& Device::getDefault()
1613 {
1614     const Context& ctx = Context::getDefault();
1615     int idx = getCoreTlsData().device;
1616     const Device& device = ctx.device(idx);
1617     return device;
1618 }
1619
1620 ////////////////////////////////////// Context ///////////////////////////////////////////////////
1621
1622 template <typename Functor, typename ObjectType>
1623 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
1624 {
1625     ::size_t required;
1626     cl_int err = f(obj, name, 0, NULL, &required);
1627     if (err != CL_SUCCESS)
1628         return err;
1629
1630     param.clear();
1631     if (required > 0)
1632     {
1633         AutoBuffer<char> buf(required + 1);
1634         char* ptr = buf.data(); // cleanup is not needed
1635         err = f(obj, name, required, ptr, NULL);
1636         if (err != CL_SUCCESS)
1637             return err;
1638         param = ptr;
1639     }
1640
1641     return CL_SUCCESS;
1642 }
1643
1644 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
1645 {
1646     elems.clear();
1647     if (s.size() == 0)
1648         return;
1649     std::istringstream ss(s);
1650     std::string item;
1651     while (!ss.eof())
1652     {
1653         std::getline(ss, item, delim);
1654         elems.push_back(item);
1655     }
1656 }
1657
1658 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
1659 // Sample: AMD:GPU:
1660 // Sample: AMD:GPU:Tahiti
1661 // Sample: :GPU|CPU: = '' = ':' = '::'
1662 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
1663         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
1664 {
1665     std::vector<std::string> parts;
1666     split(configurationStr, ':', parts);
1667     if (parts.size() > 3)
1668     {
1669         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
1670         return false;
1671     }
1672     if (parts.size() > 2)
1673         deviceNameOrID = parts[2];
1674     if (parts.size() > 1)
1675     {
1676         split(parts[1], '|', deviceTypes);
1677     }
1678     if (parts.size() > 0)
1679     {
1680         platform = parts[0];
1681     }
1682     return true;
1683 }
1684
1685 #if defined WINRT || defined _WIN32_WCE
1686 static cl_device_id selectOpenCLDevice()
1687 {
1688     return NULL;
1689 }
1690 #else
1691 // std::tolower is int->int
1692 static char char_tolower(char ch)
1693 {
1694     return (char)std::tolower((int)ch);
1695 }
1696 static cl_device_id selectOpenCLDevice()
1697 {
1698     std::string platform, deviceName;
1699     std::vector<std::string> deviceTypes;
1700
1701     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
1702     if (configuration &&
1703             (strcmp(configuration, "disabled") == 0 ||
1704              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
1705             ))
1706         return NULL;
1707
1708     bool isID = false;
1709     int deviceID = -1;
1710     if (deviceName.length() == 1)
1711     // We limit ID range to 0..9, because we want to write:
1712     // - '2500' to mean i5-2500
1713     // - '8350' to mean AMD FX-8350
1714     // - '650' to mean GeForce 650
1715     // To extend ID range change condition to '> 0'
1716     {
1717         isID = true;
1718         for (size_t i = 0; i < deviceName.length(); i++)
1719         {
1720             if (!isdigit(deviceName[i]))
1721             {
1722                 isID = false;
1723                 break;
1724             }
1725         }
1726         if (isID)
1727         {
1728             deviceID = atoi(deviceName.c_str());
1729             if (deviceID < 0)
1730                 return NULL;
1731         }
1732     }
1733
1734     std::vector<cl_platform_id> platforms;
1735     {
1736         cl_uint numPlatforms = 0;
1737         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
1738
1739         if (numPlatforms == 0)
1740             return NULL;
1741         platforms.resize((size_t)numPlatforms);
1742         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
1743         platforms.resize(numPlatforms);
1744     }
1745
1746     int selectedPlatform = -1;
1747     if (platform.length() > 0)
1748     {
1749         for (size_t i = 0; i < platforms.size(); i++)
1750         {
1751             std::string name;
1752             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
1753             if (name.find(platform) != std::string::npos)
1754             {
1755                 selectedPlatform = (int)i;
1756                 break;
1757             }
1758         }
1759         if (selectedPlatform == -1)
1760         {
1761             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
1762             goto not_found;
1763         }
1764     }
1765     if (deviceTypes.size() == 0)
1766     {
1767         if (!isID)
1768         {
1769             deviceTypes.push_back("GPU");
1770             if (configuration)
1771                 deviceTypes.push_back("CPU");
1772         }
1773         else
1774             deviceTypes.push_back("ALL");
1775     }
1776     for (size_t t = 0; t < deviceTypes.size(); t++)
1777     {
1778         int deviceType = 0;
1779         std::string tempStrDeviceType = deviceTypes[t];
1780         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
1781
1782         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1783             deviceType = Device::TYPE_GPU;
1784         else if (tempStrDeviceType == "cpu")
1785             deviceType = Device::TYPE_CPU;
1786         else if (tempStrDeviceType == "accelerator")
1787             deviceType = Device::TYPE_ACCELERATOR;
1788         else if (tempStrDeviceType == "all")
1789             deviceType = Device::TYPE_ALL;
1790         else
1791         {
1792             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
1793             goto not_found;
1794         }
1795
1796         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
1797         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
1798                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
1799                 i++)
1800         {
1801             cl_uint count = 0;
1802             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
1803             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1804             {
1805                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
1806             }
1807             if (count == 0)
1808                 continue;
1809             size_t base = devices.size();
1810             devices.resize(base + count);
1811             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
1812             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
1813             {
1814                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
1815             }
1816         }
1817
1818         for (size_t i = (isID ? deviceID : 0);
1819              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
1820              i++)
1821         {
1822             std::string name;
1823             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
1824             cl_bool useGPU = true;
1825             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
1826             {
1827                 cl_bool isIGPU = CL_FALSE;
1828                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
1829                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
1830             }
1831             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
1832             {
1833                 // TODO check for OpenCL 1.1
1834                 return devices[i];
1835             }
1836         }
1837     }
1838
1839 not_found:
1840     if (!configuration)
1841         return NULL; // suppress messages on stderr
1842
1843     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
1844             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
1845             << "    Device types: ";
1846     for (size_t t = 0; t < deviceTypes.size(); t++)
1847         std::cerr << deviceTypes[t] << " ";
1848
1849     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
1850     return NULL;
1851 }
1852 #endif
1853
1854 #ifdef HAVE_OPENCL_SVM
1855 namespace svm {
1856
1857 enum AllocatorFlags { // don't use first 16 bits
1858         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
1859         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
1860         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
1861         OPENCL_SVM_BUFFER_MASK = 3 << 16,
1862         OPENCL_SVM_BUFFER_MAP = 4 << 16
1863 };
1864
1865 static bool checkForceSVMUmatUsage()
1866 {
1867     static bool initialized = false;
1868     static bool force = false;
1869     if (!initialized)
1870     {
1871         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
1872         initialized = true;
1873     }
1874     return force;
1875 }
1876 static bool checkDisableSVMUMatUsage()
1877 {
1878     static bool initialized = false;
1879     static bool force = false;
1880     if (!initialized)
1881     {
1882         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
1883         initialized = true;
1884     }
1885     return force;
1886 }
1887 static bool checkDisableSVM()
1888 {
1889     static bool initialized = false;
1890     static bool force = false;
1891     if (!initialized)
1892     {
1893         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
1894         initialized = true;
1895     }
1896     return force;
1897 }
1898 // see SVMCapabilities
1899 static unsigned int getSVMCapabilitiesMask()
1900 {
1901     static bool initialized = false;
1902     static unsigned int mask = 0;
1903     if (!initialized)
1904     {
1905         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
1906         if (envValue == NULL)
1907         {
1908             return ~0U; // all bits 1
1909         }
1910         mask = atoi(envValue);
1911         initialized = true;
1912     }
1913     return mask;
1914 }
1915 } // namespace
1916 #endif
1917
1918 static size_t getProgramCountLimit()
1919 {
1920     static bool initialized = false;
1921     static size_t count = 0;
1922     if (!initialized)
1923     {
1924         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
1925         initialized = true;
1926     }
1927     return count;
1928 }
1929
1930 struct Context::Impl
1931 {
1932     static Context::Impl* get(Context& context) { return context.p; }
1933
1934     void __init()
1935     {
1936         refcount = 1;
1937         handle = 0;
1938 #ifdef HAVE_OPENCL_SVM
1939         svmInitialized = false;
1940 #endif
1941     }
1942
1943     Impl()
1944     {
1945         __init();
1946     }
1947
1948     void setDefault()
1949     {
1950         CV_Assert(handle == NULL);
1951
1952         cl_device_id d = selectOpenCLDevice();
1953
1954         if (d == NULL)
1955             return;
1956
1957         cl_platform_id pl = NULL;
1958         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
1959
1960         cl_context_properties prop[] =
1961         {
1962             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1963             0
1964         };
1965
1966         // !!! in the current implementation force the number of devices to 1 !!!
1967         cl_uint nd = 1;
1968         cl_int status;
1969
1970         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
1971         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
1972
1973         bool ok = handle != 0 && status == CL_SUCCESS;
1974         if( ok )
1975         {
1976             devices.resize(nd);
1977             devices[0].set(d);
1978         }
1979         else
1980             handle = NULL;
1981     }
1982
1983     Impl(int dtype0)
1984     {
1985         __init();
1986
1987         cl_int retval = 0;
1988         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
1989         cl_context_properties prop[] =
1990         {
1991             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
1992             0
1993         };
1994
1995         cl_uint nd0 = 0;
1996         int dtype = dtype0 & 15;
1997         cl_int status = clGetDeviceIDs(pl, dtype, 0, NULL, &nd0);
1998         if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
1999         {
2000             CV_OCL_DBG_CHECK_RESULT(status,
2001                 cv::format("clGetDeviceIDs(platform=%p, device_type=%d, num_entries=0, devices=NULL, numDevices=%p)", pl, dtype, &nd0).c_str());
2002         }
2003
2004         if (nd0 == 0)
2005             return;
2006
2007         AutoBuffer<void*> dlistbuf(nd0*2+1);
2008         cl_device_id* dlist = (cl_device_id*)dlistbuf.data();
2009         cl_device_id* dlist_new = dlist + nd0;
2010         CV_OCL_DBG_CHECK(clGetDeviceIDs(pl, dtype, nd0, dlist, &nd0));
2011
2012         cl_uint i, nd = 0;
2013         String name0;
2014         for(i = 0; i < nd0; i++)
2015         {
2016             Device d(dlist[i]);
2017             if( !d.available() || !d.compilerAvailable() )
2018                 continue;
2019             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2020                 continue;
2021             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2022                 continue;
2023             String name = d.name();
2024             if( nd != 0 && name != name0 )
2025                 continue;
2026             name0 = name;
2027             dlist_new[nd++] = dlist[i];
2028         }
2029
2030         if(nd == 0)
2031             return;
2032
2033         // !!! in the current implementation force the number of devices to 1 !!!
2034         nd = 1;
2035
2036         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2037         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateContext");
2038         bool ok = handle != 0 && retval == CL_SUCCESS;
2039         if( ok )
2040         {
2041             devices.resize(nd);
2042             for( i = 0; i < nd; i++ )
2043                 devices[i].set(dlist_new[i]);
2044         }
2045     }
2046
2047     ~Impl()
2048     {
2049         if(handle)
2050         {
2051             CV_OCL_DBG_CHECK(clReleaseContext(handle));
2052             handle = NULL;
2053         }
2054         devices.clear();
2055     }
2056
2057     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2058
2059     void unloadProg(Program& prog)
2060     {
2061         cv::AutoLock lock(program_cache_mutex);
2062         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2063         {
2064               phash_t::iterator it = phash.find(*i);
2065               if (it != phash.end())
2066               {
2067                   if (it->second.ptr() == prog.ptr())
2068                   {
2069                       phash.erase(*i);
2070                       cacheList.erase(i);
2071                       return;
2072                   }
2073               }
2074         }
2075     }
2076
2077     std::string& getPrefixString()
2078     {
2079         if (prefix.empty())
2080         {
2081             cv::AutoLock lock(program_cache_mutex);
2082             if (prefix.empty())
2083             {
2084                 CV_Assert(!devices.empty());
2085                 const Device& d = devices[0];
2086                 int bits = d.addressBits();
2087                 if (bits > 0 && bits != 64)
2088                     prefix = cv::format("%d-bit--", bits);
2089                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2090                 // sanitize chars
2091                 for (size_t i = 0; i < prefix.size(); i++)
2092                 {
2093                     char c = prefix[i];
2094                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2095                     {
2096                         prefix[i] = '_';
2097                     }
2098                 }
2099             }
2100         }
2101         return prefix;
2102     }
2103
2104     std::string& getPrefixBase()
2105     {
2106         if (prefix_base.empty())
2107         {
2108             cv::AutoLock lock(program_cache_mutex);
2109             if (prefix_base.empty())
2110             {
2111                 const Device& d = devices[0];
2112                 int bits = d.addressBits();
2113                 if (bits > 0 && bits != 64)
2114                     prefix_base = cv::format("%d-bit--", bits);
2115                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2116                 // sanitize chars
2117                 for (size_t i = 0; i < prefix_base.size(); i++)
2118                 {
2119                     char c = prefix_base[i];
2120                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2121                     {
2122                         prefix_base[i] = '_';
2123                     }
2124                 }
2125             }
2126         }
2127         return prefix_base;
2128     }
2129
2130     IMPLEMENT_REFCOUNTABLE();
2131
2132     cl_context handle;
2133     std::vector<Device> devices;
2134
2135     std::string prefix;
2136     std::string prefix_base;
2137
2138     cv::Mutex program_cache_mutex;
2139     typedef std::map<std::string, Program> phash_t;
2140     phash_t phash;
2141     typedef std::list<cv::String> CacheList;
2142     CacheList cacheList;
2143
2144 #ifdef HAVE_OPENCL_SVM
2145     bool svmInitialized;
2146     bool svmAvailable;
2147     bool svmEnabled;
2148     svm::SVMCapabilities svmCapabilities;
2149     svm::SVMFunctions svmFunctions;
2150
2151     void svmInit()
2152     {
2153         CV_Assert(handle != NULL);
2154         const Device& device = devices[0];
2155         cl_device_svm_capabilities deviceCaps = 0;
2156         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2157         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2158         if (status != CL_SUCCESS)
2159         {
2160             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2161             goto noSVM;
2162         }
2163         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2164         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2165         svmCapabilities.value_ =
2166                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2167                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2168                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2169                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2170         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2171         if (svmCapabilities.value_ == 0)
2172         {
2173             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2174             goto noSVM;
2175         }
2176         try
2177         {
2178             // Try OpenCL 2.0
2179             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2180             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2181             if (!ptr)
2182             {
2183                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2184                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2185             }
2186             try
2187             {
2188                 bool error = false;
2189                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2190                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2191                 {
2192                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2193                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2194                 }
2195                 clFinish(q);
2196                 try
2197                 {
2198                     ((int*)ptr)[0] = 100;
2199                 }
2200                 catch (...)
2201                 {
2202                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2203                     error = true;
2204                 }
2205                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2206                 {
2207                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2208                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2209                 }
2210                 clFinish(q);
2211                 if (error)
2212                 {
2213                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2214                 }
2215             }
2216             catch (...)
2217             {
2218                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2219                 clSVMFree(handle, ptr);
2220                 throw;
2221             }
2222             clSVMFree(handle, ptr);
2223             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2224             svmFunctions.fn_clSVMFree = clSVMFree;
2225             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2226             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2227             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2228             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2229             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2230             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2231             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2232         }
2233         catch (...)
2234         {
2235             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2236             try
2237             {
2238                 // Try HSA extension
2239                 String extensions = device.extensions();
2240                 if (extensions.find("cl_amd_svm") == String::npos)
2241                 {
2242                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2243                     goto noSVM;
2244                 }
2245                 cl_platform_id p = NULL;
2246                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2247                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2248                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2249                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2250                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2251                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2252                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2253                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2254                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2255                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2256                 CV_Assert(svmFunctions.isValid());
2257             }
2258             catch (...)
2259             {
2260                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2261                 goto noSVM;
2262             }
2263         }
2264
2265         svmAvailable = true;
2266         svmEnabled = !svm::checkDisableSVM();
2267         svmInitialized = true;
2268         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2269         return;
2270     noSVM:
2271         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2272         svmAvailable = false;
2273         svmEnabled = false;
2274         svmCapabilities.value_ = 0;
2275         svmInitialized = true;
2276         svmFunctions.fn_clSVMAlloc = NULL;
2277         return;
2278     }
2279 #endif
2280
2281     friend class Program;
2282 };
2283
2284
2285 Context::Context()
2286 {
2287     p = 0;
2288 }
2289
2290 Context::Context(int dtype)
2291 {
2292     p = 0;
2293     create(dtype);
2294 }
2295
2296 bool Context::create()
2297 {
2298     if( !haveOpenCL() )
2299         return false;
2300     if(p)
2301         p->release();
2302     p = new Impl();
2303     if(!p->handle)
2304     {
2305         delete p;
2306         p = 0;
2307     }
2308     return p != 0;
2309 }
2310
2311 bool Context::create(int dtype0)
2312 {
2313     if( !haveOpenCL() )
2314         return false;
2315     if(p)
2316         p->release();
2317     p = new Impl(dtype0);
2318     if(!p->handle)
2319     {
2320         delete p;
2321         p = 0;
2322     }
2323     return p != 0;
2324 }
2325
2326 Context::~Context()
2327 {
2328     if (p)
2329     {
2330         p->release();
2331         p = NULL;
2332     }
2333 }
2334
2335 Context::Context(const Context& c)
2336 {
2337     p = (Impl*)c.p;
2338     if(p)
2339         p->addref();
2340 }
2341
2342 Context& Context::operator = (const Context& c)
2343 {
2344     Impl* newp = (Impl*)c.p;
2345     if(newp)
2346         newp->addref();
2347     if(p)
2348         p->release();
2349     p = newp;
2350     return *this;
2351 }
2352
2353 void* Context::ptr() const
2354 {
2355     return p == NULL ? NULL : p->handle;
2356 }
2357
2358 size_t Context::ndevices() const
2359 {
2360     return p ? p->devices.size() : 0;
2361 }
2362
2363 const Device& Context::device(size_t idx) const
2364 {
2365     static Device dummy;
2366     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2367 }
2368
2369 Context& Context::getDefault(bool initialize)
2370 {
2371     static Context* ctx = new Context();
2372     if(!ctx->p && haveOpenCL())
2373     {
2374         if (!ctx->p)
2375             ctx->p = new Impl();
2376         if (initialize)
2377         {
2378             // do not create new Context right away.
2379             // First, try to retrieve existing context of the same type.
2380             // In its turn, Platform::getContext() may call Context::create()
2381             // if there is no such context.
2382             if (ctx->p->handle == NULL)
2383                 ctx->p->setDefault();
2384         }
2385     }
2386
2387     return *ctx;
2388 }
2389
2390 Program Context::getProg(const ProgramSource& prog,
2391                          const String& buildopts, String& errmsg)
2392 {
2393     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2394 }
2395
2396 void Context::unloadProg(Program& prog)
2397 {
2398     if (p)
2399         p->unloadProg(prog);
2400 }
2401
2402 #ifdef HAVE_OPENCL_SVM
2403 bool Context::useSVM() const
2404 {
2405     Context::Impl* i = p;
2406     CV_Assert(i);
2407     if (!i->svmInitialized)
2408         i->svmInit();
2409     return i->svmEnabled;
2410 }
2411 void Context::setUseSVM(bool enabled)
2412 {
2413     Context::Impl* i = p;
2414     CV_Assert(i);
2415     if (!i->svmInitialized)
2416         i->svmInit();
2417     if (enabled && !i->svmAvailable)
2418     {
2419         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
2420     }
2421     i->svmEnabled = enabled;
2422 }
2423 #else
2424 bool Context::useSVM() const { return false; }
2425 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
2426 #endif
2427
2428 #ifdef HAVE_OPENCL_SVM
2429 namespace svm {
2430
2431 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
2432 {
2433     Context::Impl* i = context.p;
2434     CV_Assert(i);
2435     if (!i->svmInitialized)
2436         i->svmInit();
2437     return i->svmCapabilities;
2438 }
2439
2440 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
2441 {
2442     Context::Impl* i = context.p;
2443     CV_Assert(i);
2444     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
2445     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
2446     return &i->svmFunctions;
2447 }
2448
2449 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
2450 {
2451     if (checkForceSVMUmatUsage())
2452         return true;
2453     if (checkDisableSVMUMatUsage())
2454         return false;
2455     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
2456         return true;
2457     return false; // don't use SVM by default
2458 }
2459
2460 } // namespace cv::ocl::svm
2461 #endif // HAVE_OPENCL_SVM
2462
2463
2464 static void get_platform_name(cl_platform_id id, String& name)
2465 {
2466     // get platform name string length
2467     size_t sz = 0;
2468     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
2469
2470     // get platform name string
2471     AutoBuffer<char> buf(sz + 1);
2472     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
2473
2474     // just in case, ensure trailing zero for ASCIIZ string
2475     buf[sz] = 0;
2476
2477     name = buf.data();
2478 }
2479
2480 /*
2481 // Attaches OpenCL context to OpenCV
2482 */
2483 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
2484 {
2485     cl_uint cnt = 0;
2486
2487     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
2488
2489     if (cnt == 0)
2490         CV_Error(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
2491
2492     std::vector<cl_platform_id> platforms(cnt);
2493
2494     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
2495
2496     bool platformAvailable = false;
2497
2498     // check if external platformName contained in list of available platforms in OpenCV
2499     for (unsigned int i = 0; i < cnt; i++)
2500     {
2501         String availablePlatformName;
2502         get_platform_name(platforms[i], availablePlatformName);
2503         // external platform is found in the list of available platforms
2504         if (platformName == availablePlatformName)
2505         {
2506             platformAvailable = true;
2507             break;
2508         }
2509     }
2510
2511     if (!platformAvailable)
2512         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2513
2514     // check if platformID corresponds to platformName
2515     String actualPlatformName;
2516     get_platform_name((cl_platform_id)platformID, actualPlatformName);
2517     if (platformName != actualPlatformName)
2518         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
2519
2520     // do not initialize OpenCL context
2521     Context ctx = Context::getDefault(false);
2522
2523     // attach supplied context to OpenCV
2524     initializeContextFromHandle(ctx, platformID, context, deviceID);
2525
2526     CV_OCL_CHECK(clRetainContext((cl_context)context));
2527
2528     // clear command queue, if any
2529     CoreTLSData& data = getCoreTlsData();
2530     data.oclQueue.finish();
2531     Queue q;
2532     data.oclQueue = q;
2533
2534     return;
2535 } // attachContext()
2536
2537
2538 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2539 {
2540     cl_context context = (cl_context)_context;
2541     cl_device_id device = (cl_device_id)_device;
2542
2543     // cleanup old context
2544     Context::Impl * impl = ctx.p;
2545     if (impl->handle)
2546     {
2547         CV_OCL_DBG_CHECK(clReleaseContext(impl->handle));
2548     }
2549     impl->devices.clear();
2550
2551     impl->handle = context;
2552     impl->devices.resize(1);
2553     impl->devices[0].set(device);
2554
2555     Platform& p = Platform::getDefault();
2556     Platform::Impl* pImpl = p.p;
2557     pImpl->handle = (cl_platform_id)platform;
2558 }
2559
2560 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2561
2562 struct Queue::Impl
2563 {
2564     inline void __init()
2565     {
2566         refcount = 1;
2567         handle = 0;
2568         isProfilingQueue_ = false;
2569     }
2570
2571     Impl(cl_command_queue q)
2572     {
2573         __init();
2574         handle = q;
2575
2576         cl_command_queue_properties props = 0;
2577         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
2578         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
2579     }
2580
2581     Impl(cl_command_queue q, bool isProfilingQueue)
2582     {
2583         __init();
2584         handle = q;
2585         isProfilingQueue_ = isProfilingQueue;
2586     }
2587
2588     Impl(const Context& c, const Device& d, bool withProfiling = false)
2589     {
2590         __init();
2591
2592         const Context* pc = &c;
2593         cl_context ch = (cl_context)pc->ptr();
2594         if( !ch )
2595         {
2596             pc = &Context::getDefault();
2597             ch = (cl_context)pc->ptr();
2598         }
2599         cl_device_id dh = (cl_device_id)d.ptr();
2600         if( !dh )
2601             dh = (cl_device_id)pc->device(0).ptr();
2602         cl_int retval = 0;
2603         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
2604         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
2605         isProfilingQueue_ = withProfiling;
2606     }
2607
2608     ~Impl()
2609     {
2610 #ifdef _WIN32
2611         if (!cv::__termination)
2612 #endif
2613         {
2614             if(handle)
2615             {
2616                 CV_OCL_DBG_CHECK(clFinish(handle));
2617                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
2618                 handle = NULL;
2619             }
2620         }
2621     }
2622
2623     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
2624     {
2625         if (isProfilingQueue_)
2626             return self;
2627
2628         if (profiling_queue_.ptr())
2629             return profiling_queue_;
2630
2631         cl_context ctx = 0;
2632         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
2633
2634         cl_device_id device = 0;
2635         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
2636
2637         cl_int result = CL_SUCCESS;
2638         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
2639         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
2640         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
2641
2642         Queue queue;
2643         queue.p = new Impl(q, true);
2644         profiling_queue_ = queue;
2645
2646         return profiling_queue_;
2647     }
2648
2649     IMPLEMENT_REFCOUNTABLE();
2650
2651     cl_command_queue handle;
2652     bool isProfilingQueue_;
2653     cv::ocl::Queue profiling_queue_;
2654 };
2655
2656 Queue::Queue()
2657 {
2658     p = 0;
2659 }
2660
2661 Queue::Queue(const Context& c, const Device& d)
2662 {
2663     p = 0;
2664     create(c, d);
2665 }
2666
2667 Queue::Queue(const Queue& q)
2668 {
2669     p = q.p;
2670     if(p)
2671         p->addref();
2672 }
2673
2674 Queue& Queue::operator = (const Queue& q)
2675 {
2676     Impl* newp = (Impl*)q.p;
2677     if(newp)
2678         newp->addref();
2679     if(p)
2680         p->release();
2681     p = newp;
2682     return *this;
2683 }
2684
2685 Queue::~Queue()
2686 {
2687     if(p)
2688         p->release();
2689 }
2690
2691 bool Queue::create(const Context& c, const Device& d)
2692 {
2693     if(p)
2694         p->release();
2695     p = new Impl(c, d);
2696     return p->handle != 0;
2697 }
2698
2699 void Queue::finish()
2700 {
2701     if(p && p->handle)
2702     {
2703         CV_OCL_DBG_CHECK(clFinish(p->handle));
2704     }
2705 }
2706
2707 const Queue& Queue::getProfilingQueue() const
2708 {
2709     CV_Assert(p);
2710     return p->getProfilingQueue(*this);
2711 }
2712
2713 void* Queue::ptr() const
2714 {
2715     return p ? p->handle : 0;
2716 }
2717
2718 Queue& Queue::getDefault()
2719 {
2720     Queue& q = getCoreTlsData().oclQueue;
2721     if( !q.p && haveOpenCL() )
2722         q.create(Context::getDefault());
2723     return q;
2724 }
2725
2726 static cl_command_queue getQueue(const Queue& q)
2727 {
2728     cl_command_queue qq = (cl_command_queue)q.ptr();
2729     if(!qq)
2730         qq = (cl_command_queue)Queue::getDefault().ptr();
2731     return qq;
2732 }
2733
2734 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2735
2736 KernelArg::KernelArg()
2737     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2738 {
2739 }
2740
2741 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2742     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2743 {
2744     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
2745 }
2746
2747 KernelArg KernelArg::Constant(const Mat& m)
2748 {
2749     CV_Assert(m.isContinuous());
2750     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2751 }
2752
2753 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2754
2755 struct Kernel::Impl
2756 {
2757     Impl(const char* kname, const Program& prog) :
2758         refcount(1), handle(NULL), isInProgress(false), nu(0)
2759     {
2760         cl_program ph = (cl_program)prog.ptr();
2761         cl_int retval = 0;
2762         name = kname;
2763         if (ph)
2764         {
2765             handle = clCreateKernel(ph, kname, &retval);
2766             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
2767         }
2768         for( int i = 0; i < MAX_ARRS; i++ )
2769             u[i] = 0;
2770         haveTempDstUMats = false;
2771         haveTempSrcUMats = false;
2772     }
2773
2774     void cleanupUMats()
2775     {
2776         for( int i = 0; i < MAX_ARRS; i++ )
2777             if( u[i] )
2778             {
2779                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2780                 {
2781                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
2782                     u[i]->currAllocator->deallocate(u[i]);
2783                 }
2784                 u[i] = 0;
2785             }
2786         nu = 0;
2787         haveTempDstUMats = false;
2788         haveTempSrcUMats = false;
2789     }
2790
2791     void addUMat(const UMat& m, bool dst)
2792     {
2793         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2794         u[nu] = m.u;
2795         CV_XADD(&m.u->urefcount, 1);
2796         nu++;
2797         if(dst && m.u->tempUMat())
2798             haveTempDstUMats = true;
2799         if(m.u->originalUMatData == NULL && m.u->tempUMat())
2800             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
2801     }
2802
2803     void addImage(const Image2D& image)
2804     {
2805         images.push_back(image);
2806     }
2807
2808     void finit(cl_event e)
2809     {
2810         CV_UNUSED(e);
2811         cleanupUMats();
2812         images.clear();
2813         isInProgress = false;
2814         release();
2815     }
2816
2817     bool run(int dims, size_t _globalsize[], size_t _localsize[],
2818             bool sync, int64* timeNS, const Queue& q);
2819
2820     ~Impl()
2821     {
2822         if(handle)
2823         {
2824             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
2825         }
2826     }
2827
2828     IMPLEMENT_REFCOUNTABLE();
2829
2830     cv::String name;
2831     cl_kernel handle;
2832     enum { MAX_ARRS = 16 };
2833     UMatData* u[MAX_ARRS];
2834     bool isInProgress;
2835     int nu;
2836     std::list<Image2D> images;
2837     bool haveTempDstUMats;
2838     bool haveTempSrcUMats;
2839 };
2840
2841 }} // namespace cv::ocl
2842
2843 extern "C" {
2844
2845 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
2846 {
2847     try
2848     {
2849         ((cv::ocl::Kernel::Impl*)p)->finit(e);
2850     }
2851     catch (const cv::Exception& exc)
2852     {
2853         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
2854     }
2855     catch (const std::exception& exc)
2856     {
2857         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
2858     }
2859     catch (...)
2860     {
2861         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
2862     }
2863 }
2864
2865 }
2866
2867 namespace cv { namespace ocl {
2868
2869 Kernel::Kernel()
2870 {
2871     p = 0;
2872 }
2873
2874 Kernel::Kernel(const char* kname, const Program& prog)
2875 {
2876     p = 0;
2877     create(kname, prog);
2878 }
2879
2880 Kernel::Kernel(const char* kname, const ProgramSource& src,
2881                const String& buildopts, String* errmsg)
2882 {
2883     p = 0;
2884     create(kname, src, buildopts, errmsg);
2885 }
2886
2887 Kernel::Kernel(const Kernel& k)
2888 {
2889     p = k.p;
2890     if(p)
2891         p->addref();
2892 }
2893
2894 Kernel& Kernel::operator = (const Kernel& k)
2895 {
2896     Impl* newp = (Impl*)k.p;
2897     if(newp)
2898         newp->addref();
2899     if(p)
2900         p->release();
2901     p = newp;
2902     return *this;
2903 }
2904
2905 Kernel::~Kernel()
2906 {
2907     if(p)
2908         p->release();
2909 }
2910
2911 bool Kernel::create(const char* kname, const Program& prog)
2912 {
2913     if(p)
2914         p->release();
2915     p = new Impl(kname, prog);
2916     if(p->handle == 0)
2917     {
2918         p->release();
2919         p = 0;
2920     }
2921 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
2922     CV_Assert(p);
2923 #endif
2924     return p != 0;
2925 }
2926
2927 bool Kernel::create(const char* kname, const ProgramSource& src,
2928                     const String& buildopts, String* errmsg)
2929 {
2930     if(p)
2931     {
2932         p->release();
2933         p = 0;
2934     }
2935     String tempmsg;
2936     if( !errmsg ) errmsg = &tempmsg;
2937     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2938     return create(kname, prog);
2939 }
2940
2941 void* Kernel::ptr() const
2942 {
2943     return p ? p->handle : 0;
2944 }
2945
2946 bool Kernel::empty() const
2947 {
2948     return ptr() == 0;
2949 }
2950
2951 int Kernel::set(int i, const void* value, size_t sz)
2952 {
2953     if (!p || !p->handle)
2954         return -1;
2955     if (i < 0)
2956         return i;
2957     if( i == 0 )
2958         p->cleanupUMats();
2959
2960     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2961     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%p)", p->name.c_str(), (int)i, (int)sz, (void*)value).c_str());
2962     if (retval != CL_SUCCESS)
2963         return -1;
2964     return i+1;
2965 }
2966
2967 int Kernel::set(int i, const Image2D& image2D)
2968 {
2969     p->addImage(image2D);
2970     cl_mem h = (cl_mem)image2D.ptr();
2971     return set(i, &h, sizeof(h));
2972 }
2973
2974 int Kernel::set(int i, const UMat& m)
2975 {
2976     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
2977 }
2978
2979 int Kernel::set(int i, const KernelArg& arg)
2980 {
2981     if( !p || !p->handle )
2982         return -1;
2983     if (i < 0)
2984     {
2985         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
2986                 p->name.c_str(), (int)i));
2987         return i;
2988     }
2989     if( i == 0 )
2990         p->cleanupUMats();
2991     cl_int status = 0;
2992     if( arg.m )
2993     {
2994         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2995                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2996         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2997         if (ptronly && arg.m->empty())
2998         {
2999             cl_mem h_null = (cl_mem)NULL;
3000             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3001             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3002             return i + 1;
3003         }
3004         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3005
3006         if (!h)
3007         {
3008             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3009                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3010             p->release();
3011             p = 0;
3012             return -1;
3013         }
3014
3015 #ifdef HAVE_OPENCL_SVM
3016         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3017         {
3018             const Context& ctx = Context::getDefault();
3019             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3020             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3021             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3022 #if 1 // TODO
3023             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3024 #else
3025             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3026 #endif
3027             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3028         }
3029         else
3030 #endif
3031         {
3032             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3033             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3034         }
3035
3036         if (ptronly)
3037         {
3038             i++;
3039         }
3040         else if( arg.m->dims <= 2 )
3041         {
3042             UMat2D u2d(*arg.m);
3043             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3044             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3045             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3046             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3047             i += 3;
3048
3049             if( !(arg.flags & KernelArg::NO_SIZE) )
3050             {
3051                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3052                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3053                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3054                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3055                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3056                 i += 2;
3057             }
3058         }
3059         else
3060         {
3061             UMat3D u3d(*arg.m);
3062             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3063             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3064             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3065             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3066             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3067             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3068             i += 4;
3069             if( !(arg.flags & KernelArg::NO_SIZE) )
3070             {
3071                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3072                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3073                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3074                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3075                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3076                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3077                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3078                 i += 3;
3079             }
3080         }
3081         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3082         return i;
3083     }
3084     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3085     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3086     return i+1;
3087 }
3088
3089 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3090                  bool sync, const Queue& q)
3091 {
3092     if (!p)
3093         return false;
3094
3095     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3096     size_t total = 1;
3097     CV_Assert(_globalsize != NULL);
3098     for (int i = 0; i < dims; i++)
3099     {
3100         size_t val = _localsize ? _localsize[i] :
3101             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3102         CV_Assert( val > 0 );
3103         total *= _globalsize[i];
3104         if (_globalsize[i] == 1 && !_localsize)
3105             val = 1;
3106         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3107     }
3108     CV_Assert(total > 0);
3109
3110     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3111 }
3112
3113
3114 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3115         bool sync, int64* timeNS, const Queue& q)
3116 {
3117     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3118
3119     if (!handle || isInProgress)
3120         return false;
3121
3122     cl_command_queue qq = getQueue(q);
3123     if (haveTempDstUMats)
3124         sync = true;
3125     if (haveTempSrcUMats)
3126         sync = true;
3127     if (timeNS)
3128         sync = true;
3129     cl_event asyncEvent = 0;
3130     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3131                                            NULL, globalsize, localsize, 0, 0,
3132                                            (sync && !timeNS) ? 0 : &asyncEvent);
3133 #if !CV_OPENCL_SHOW_RUN_KERNELS
3134     if (retval != CL_SUCCESS)
3135 #endif
3136     {
3137         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%dx%dx%d, localsize=%s) sync=%s", name.c_str(), (int)dims,
3138                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3139                         (localsize ? cv::format("%dx%dx%d", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3140                         sync ? "true" : "false"
3141                         );
3142         if (retval != CL_SUCCESS)
3143         {
3144             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3145         }
3146 #if CV_OPENCL_TRACE_CHECK
3147         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3148 #else
3149         printf("%s\n", msg.c_str());
3150         fflush(stdout);
3151 #endif
3152     }
3153     if (sync || retval != CL_SUCCESS)
3154     {
3155         CV_OCL_DBG_CHECK(clFinish(qq));
3156         if (timeNS)
3157         {
3158             if (retval == CL_SUCCESS)
3159             {
3160                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3161                 cl_ulong startTime, stopTime;
3162                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3163                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3164                 *timeNS = (int64)(stopTime - startTime);
3165             }
3166             else
3167             {
3168                 *timeNS = -1;
3169             }
3170         }
3171         cleanupUMats();
3172     }
3173     else
3174     {
3175         addref();
3176         isInProgress = true;
3177         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3178     }
3179     if (asyncEvent)
3180         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3181     return retval == CL_SUCCESS;
3182 }
3183
3184 bool Kernel::runTask(bool sync, const Queue& q)
3185 {
3186     if(!p || !p->handle || p->isInProgress)
3187         return false;
3188
3189     cl_command_queue qq = getQueue(q);
3190     cl_event asyncEvent = 0;
3191     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3192     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3193     if (sync || retval != CL_SUCCESS)
3194     {
3195         CV_OCL_DBG_CHECK(clFinish(qq));
3196         p->cleanupUMats();
3197     }
3198     else
3199     {
3200         p->addref();
3201         p->isInProgress = true;
3202         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3203     }
3204     if (asyncEvent)
3205         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3206     return retval == CL_SUCCESS;
3207 }
3208
3209 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3210 {
3211     CV_Assert(p && p->handle && !p->isInProgress);
3212     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3213     CV_Assert(q.ptr());
3214     q.finish(); // call clFinish() on base queue
3215     Queue profilingQueue = q.getProfilingQueue();
3216     int64 timeNs = -1;
3217     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3218     return res ? timeNs : -1;
3219 }
3220
3221 size_t Kernel::workGroupSize() const
3222 {
3223     if(!p || !p->handle)
3224         return 0;
3225     size_t val = 0, retsz = 0;
3226     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3227     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3228     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3229     return status == CL_SUCCESS ? val : 0;
3230 }
3231
3232 size_t Kernel::preferedWorkGroupSizeMultiple() const
3233 {
3234     if(!p || !p->handle)
3235         return 0;
3236     size_t val = 0, retsz = 0;
3237     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3238     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3239     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3240     return status == CL_SUCCESS ? val : 0;
3241 }
3242
3243 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3244 {
3245     if(!p || !p->handle || !wsz)
3246         return 0;
3247     size_t retsz = 0;
3248     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3249     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3250     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3251     return status == CL_SUCCESS;
3252 }
3253
3254 size_t Kernel::localMemSize() const
3255 {
3256     if(!p || !p->handle)
3257         return 0;
3258     size_t retsz = 0;
3259     cl_ulong val = 0;
3260     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3261     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3262     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3263     return status == CL_SUCCESS ? (size_t)val : 0;
3264 }
3265
3266
3267
3268 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3269
3270 struct ProgramSource::Impl
3271 {
3272     IMPLEMENT_REFCOUNTABLE();
3273
3274     enum KIND {
3275         PROGRAM_SOURCE_CODE = 0,
3276         PROGRAM_BINARIES,
3277         PROGRAM_SPIR,
3278         PROGRAM_SPIRV
3279     } kind_;
3280
3281     Impl(const String& src)
3282     {
3283         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3284         initFromSource(src, cv::String());
3285     }
3286     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3287     {
3288         init(PROGRAM_SOURCE_CODE, module, name);
3289         initFromSource(codeStr, codeHash);
3290     }
3291
3292     /// reset fields
3293     void init(enum KIND kind, const String& module, const String& name)
3294     {
3295         refcount = 1;
3296         kind_ = kind;
3297         module_ = module;
3298         name_ = name;
3299
3300         sourceAddr_ = NULL;
3301         sourceSize_ = 0;
3302         isHashUpdated = false;
3303     }
3304
3305     void initFromSource(const String& codeStr, const String& codeHash)
3306     {
3307         codeStr_ = codeStr;
3308         sourceHash_ = codeHash;
3309         if (sourceHash_.empty())
3310         {
3311             updateHash();
3312         }
3313         else
3314         {
3315             isHashUpdated = true;
3316         }
3317     }
3318
3319     void updateHash(const char* hashStr = NULL)
3320     {
3321         if (hashStr)
3322         {
3323             sourceHash_ = cv::String(hashStr);
3324             isHashUpdated = true;
3325             return;
3326         }
3327         uint64 hash = 0;
3328         switch (kind_)
3329         {
3330         case PROGRAM_SOURCE_CODE:
3331             if (sourceAddr_)
3332             {
3333                 CV_Assert(codeStr_.empty());
3334                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3335             }
3336             else
3337             {
3338                 CV_Assert(!codeStr_.empty());
3339                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3340             }
3341             break;
3342         case PROGRAM_BINARIES:
3343         case PROGRAM_SPIR:
3344         case PROGRAM_SPIRV:
3345             hash = crc64(sourceAddr_, sourceSize_);
3346             break;
3347         default:
3348             CV_Error(Error::StsInternal, "Internal error");
3349         }
3350         sourceHash_ = cv::format("%08llx", hash);
3351         isHashUpdated = true;
3352     }
3353
3354     Impl(enum KIND kind,
3355             const String& module, const String& name,
3356             const unsigned char* binary, const size_t size,
3357             const cv::String& buildOptions = cv::String())
3358     {
3359         init(kind, module, name);
3360
3361         sourceAddr_ = binary;
3362         sourceSize_ = size;
3363
3364         buildOptions_ = buildOptions;
3365     }
3366
3367     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
3368             const char* sourceCodeStaticStr, const char* hashStaticStr,
3369             const cv::String& buildOptions)
3370     {
3371         ProgramSource result;
3372         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
3373                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
3374         result.p->updateHash(hashStaticStr);
3375         return result;
3376     }
3377
3378     static ProgramSource fromBinary(const String& module, const String& name,
3379             const unsigned char* binary, const size_t size,
3380             const cv::String& buildOptions)
3381     {
3382         ProgramSource result;
3383         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
3384         return result;
3385     }
3386
3387     static ProgramSource fromSPIR(const String& module, const String& name,
3388             const unsigned char* binary, const size_t size,
3389             const cv::String& buildOptions)
3390     {
3391         ProgramSource result;
3392         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
3393         return result;
3394     }
3395
3396     String module_;
3397     String name_;
3398
3399     // TODO std::vector<ProgramSource> includes_;
3400     String codeStr_; // PROGRAM_SOURCE_CODE only
3401
3402     const unsigned char* sourceAddr_;
3403     size_t sourceSize_;
3404
3405     cv::String buildOptions_;
3406
3407     String sourceHash_;
3408     bool isHashUpdated;
3409
3410     friend struct Program::Impl;
3411     friend struct internal::ProgramEntry;
3412     friend struct Context::Impl;
3413 };
3414
3415
3416 ProgramSource::ProgramSource()
3417 {
3418     p = 0;
3419 }
3420
3421 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
3422 {
3423     p = new Impl(module, name, codeStr, codeHash);
3424 }
3425
3426 ProgramSource::ProgramSource(const char* prog)
3427 {
3428     p = new Impl(prog);
3429 }
3430
3431 ProgramSource::ProgramSource(const String& prog)
3432 {
3433     p = new Impl(prog);
3434 }
3435
3436 ProgramSource::~ProgramSource()
3437 {
3438     if(p)
3439         p->release();
3440 }
3441
3442 ProgramSource::ProgramSource(const ProgramSource& prog)
3443 {
3444     p = prog.p;
3445     if(p)
3446         p->addref();
3447 }
3448
3449 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3450 {
3451     Impl* newp = (Impl*)prog.p;
3452     if(newp)
3453         newp->addref();
3454     if(p)
3455         p->release();
3456     p = newp;
3457     return *this;
3458 }
3459
3460 const String& ProgramSource::source() const
3461 {
3462     CV_Assert(p);
3463     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
3464     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
3465     return p->codeStr_;
3466 }
3467
3468 ProgramSource::hash_t ProgramSource::hash() const
3469 {
3470     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
3471 }
3472
3473 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
3474         const unsigned char* binary, const size_t size,
3475         const cv::String& buildOptions)
3476 {
3477     CV_Assert(binary);
3478     CV_Assert(size > 0);
3479     return Impl::fromBinary(module, name, binary, size, buildOptions);
3480 }
3481
3482 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
3483         const unsigned char* binary, const size_t size,
3484         const cv::String& buildOptions)
3485 {
3486     CV_Assert(binary);
3487     CV_Assert(size > 0);
3488     return Impl::fromBinary(module, name, binary, size, buildOptions);
3489 }
3490
3491
3492 internal::ProgramEntry::operator ProgramSource&() const
3493 {
3494     if (this->pProgramSource == NULL)
3495     {
3496         cv::AutoLock lock(cv::getInitializationMutex());
3497         if (this->pProgramSource == NULL)
3498         {
3499             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
3500             ProgramSource* ptr = new ProgramSource(ps);
3501             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
3502         }
3503     }
3504     return *this->pProgramSource;
3505 }
3506
3507
3508
3509 /////////////////////////////////////////// Program /////////////////////////////////////////////
3510
3511 static
3512 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
3513 {
3514     if (b.empty())
3515         return a;
3516     if (a.empty())
3517         return b;
3518     if (b[0] == ' ')
3519         return a + b;
3520     return a + (cv::String(" ") + b);
3521 }
3522
3523 struct Program::Impl
3524 {
3525     IMPLEMENT_REFCOUNTABLE();
3526
3527     Impl(const ProgramSource& src,
3528          const String& _buildflags, String& errmsg) :
3529          refcount(1),
3530          handle(NULL),
3531          buildflags(_buildflags)
3532     {
3533         const ProgramSource::Impl* src_ = src.getImpl();
3534         CV_Assert(src_);
3535         sourceModule_ = src_->module_;
3536         sourceName_ = src_->name_;
3537         const Context ctx = Context::getDefault();
3538         Device device = ctx.device(0);
3539         if (ctx.ptr() == NULL || device.ptr() == NULL)
3540             return;
3541         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
3542         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3543         {
3544             if (device.isAMD())
3545                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
3546             else if (device.isIntel())
3547                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
3548             const String param_buildExtraOptions = getBuildExtraOptions();
3549             if (!param_buildExtraOptions.empty())
3550                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
3551         }
3552         compile(ctx, src_, errmsg);
3553     }
3554
3555     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3556     {
3557         CV_Assert(ctx.getImpl());
3558         CV_Assert(src_);
3559
3560         // We don't cache OpenCL binaries
3561         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
3562         {
3563             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3564             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3565             return isLoaded;
3566         }
3567         return compileWithCache(ctx, src_, errmsg);
3568     }
3569
3570     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3571     {
3572         CV_Assert(ctx.getImpl());
3573         CV_Assert(src_);
3574         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
3575
3576 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3577         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
3578         const std::string base_dir = config.prepareCacheDirectoryForContext(
3579                 ctx.getImpl()->getPrefixString(),
3580                 ctx.getImpl()->getPrefixBase()
3581         );
3582         const String& hash_str = src_->sourceHash_;
3583         cv::String fname;
3584         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
3585         {
3586             CV_Assert(!hash_str.empty());
3587             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
3588             fname = utils::fs::join(base_dir, fname);
3589         }
3590         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
3591         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
3592         {
3593             try
3594             {
3595                 std::vector<char> binaryBuf;
3596                 bool res = false;
3597                 {
3598                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3599                     BinaryProgramFile file(fname, hash_str.c_str());
3600                     res = file.read(buildflags, binaryBuf);
3601                 }
3602                 if (res)
3603                 {
3604                     CV_Assert(!binaryBuf.empty());
3605                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
3606                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
3607                     if (isLoaded)
3608                         return true;
3609                 }
3610             }
3611             catch (const cv::Exception& e)
3612             {
3613                 CV_UNUSED(e);
3614                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
3615             }
3616             catch (...)
3617             {
3618                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
3619             }
3620         }
3621 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3622         CV_Assert(handle == NULL);
3623         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
3624         {
3625             if (!buildFromSources(ctx, src_, errmsg))
3626             {
3627                 return false;
3628             }
3629         }
3630         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
3631         {
3632             buildflags = joinBuildOptions(buildflags, " -x spir");
3633             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
3634             {
3635                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
3636             }
3637             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
3638             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
3639             if (!isLoaded)
3640                 return false;
3641         }
3642         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
3643         {
3644             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
3645         }
3646         else
3647         {
3648             CV_Error(Error::StsInternal, "Internal error");
3649         }
3650         CV_Assert(handle != NULL);
3651 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
3652         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
3653         {
3654             try
3655             {
3656                 std::vector<char> binaryBuf;
3657                 getProgramBinary(binaryBuf);
3658                 {
3659                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
3660                     BinaryProgramFile file(fname, hash_str.c_str());
3661                     file.write(buildflags, binaryBuf);
3662                 }
3663             }
3664             catch (const cv::Exception& e)
3665             {
3666                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
3667             }
3668             catch (...)
3669             {
3670                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
3671             }
3672         }
3673 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
3674 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3675         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3676         {
3677             std::vector<char> binaryBuf;
3678             getProgramBinary(binaryBuf);
3679             if (!binaryBuf.empty())
3680             {
3681                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3682                 handle = NULL;
3683                 createFromBinary(ctx, binaryBuf, errmsg);
3684             }
3685         }
3686 #endif
3687         return handle != NULL;
3688     }
3689
3690     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
3691     {
3692         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
3693
3694         size_t retsz = 0;
3695         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3696                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3697         if (log_retval == CL_SUCCESS && retsz > 1)
3698         {
3699             buffer.resize(retsz + 16);
3700             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
3701                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
3702             if (log_retval == CL_SUCCESS)
3703             {
3704                 if (retsz < buffer.size())
3705                     buffer[retsz] = 0;
3706                 else
3707                     buffer[buffer.size() - 1] = 0;
3708             }
3709             else
3710             {
3711                 buffer[0] = 0;
3712             }
3713         }
3714
3715         errmsg = String(buffer.data());
3716         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
3717                 sourceModule_.c_str(), sourceName_.c_str(),
3718                 result, getOpenCLErrorString(result),
3719                 buildflags.c_str(), errmsg.c_str());
3720         fflush(stdout);
3721     }
3722
3723     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
3724     {
3725         CV_Assert(src_);
3726         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
3727         CV_Assert(handle == NULL);
3728         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
3729                 sourceModule_.c_str(), sourceName_.c_str(),
3730                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
3731
3732         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
3733
3734         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
3735         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
3736         CV_Assert(srcptr != NULL);
3737         CV_Assert(srclen > 0);
3738
3739         cl_int retval = 0;
3740
3741         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3742         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
3743         CV_Assert(handle || retval != CL_SUCCESS);
3744         if (handle && retval == CL_SUCCESS)
3745         {
3746             size_t n = ctx.ndevices();
3747             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
3748             cl_device_id* deviceList = deviceListBuf.data();
3749             for (size_t i = 0; i < n; i++)
3750             {
3751                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
3752             }
3753
3754             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
3755             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
3756 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3757             if (retval != CL_SUCCESS)
3758 #endif
3759             {
3760                 dumpBuildLog_(retval, deviceList, errmsg);
3761
3762                 // don't remove "retval != CL_SUCCESS" condition here:
3763                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
3764                 if (retval != CL_SUCCESS && handle)
3765                 {
3766                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3767                     handle = NULL;
3768                 }
3769             }
3770 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3771             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3772             {
3773                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
3774                 size_t retsz = 0;
3775                 char kernels_buffer[4096] = {0};
3776                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3777                 if (retsz < sizeof(kernels_buffer))
3778                     kernels_buffer[retsz] = 0;
3779                 else
3780                     kernels_buffer[0] = 0;
3781                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3782             }
3783 #endif
3784
3785         }
3786         return handle != NULL;
3787     }
3788
3789     void getProgramBinary(std::vector<char>& buf)
3790     {
3791         CV_Assert(handle);
3792         size_t sz = 0;
3793         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
3794         buf.resize(sz);
3795         uchar* ptr = (uchar*)&buf[0];
3796         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
3797     }
3798
3799     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
3800     {
3801         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
3802     }
3803
3804     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
3805     {
3806         CV_Assert(handle == NULL);
3807         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
3808         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
3809
3810         CV_Assert(binarySize > 0);
3811
3812         size_t ndevices = (int)ctx.ndevices();
3813         AutoBuffer<cl_device_id> devices_(ndevices);
3814         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
3815         AutoBuffer<size_t> binarySizes_(ndevices);
3816
3817         cl_device_id* devices = devices_.data();
3818         const uchar** binaryPtrs = binaryPtrs_.data();
3819         size_t* binarySizes = binarySizes_.data();
3820         for (size_t i = 0; i < ndevices; i++)
3821         {
3822             devices[i] = (cl_device_id)ctx.device(i).ptr();
3823             binaryPtrs[i] = binaryAddr;
3824             binarySizes[i] = binarySize;
3825         }
3826
3827         cl_int result = 0;
3828         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
3829                                            binarySizes, binaryPtrs, NULL, &result);
3830         if (result != CL_SUCCESS)
3831         {
3832             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
3833             if (handle)
3834             {
3835                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3836                 handle = NULL;
3837             }
3838         }
3839         if (!handle)
3840         {
3841             return false;
3842         }
3843         // call clBuildProgram()
3844         {
3845             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
3846             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
3847             if (result != CL_SUCCESS)
3848             {
3849                 dumpBuildLog_(result, devices, errmsg);
3850                 if (handle)
3851                 {
3852                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3853                     handle = NULL;
3854                 }
3855                 return false;
3856             }
3857         }
3858         // check build status
3859         {
3860             cl_build_status build_status = CL_BUILD_NONE;
3861             size_t retsz = 0;
3862             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
3863                     sizeof(build_status), &build_status, &retsz));
3864             if (result == CL_SUCCESS)
3865             {
3866                 if (build_status == CL_BUILD_SUCCESS)
3867                 {
3868                     return true;
3869                 }
3870                 else
3871                 {
3872                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
3873                     return false;
3874                 }
3875             }
3876             else
3877             {
3878                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
3879                 if (handle)
3880                 {
3881                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
3882                     handle = NULL;
3883                 }
3884             }
3885         }
3886 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
3887         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
3888         {
3889             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
3890             size_t retsz = 0;
3891             char kernels_buffer[4096] = {0};
3892             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
3893             if (retsz < sizeof(kernels_buffer))
3894                 kernels_buffer[retsz] = 0;
3895             else
3896                 kernels_buffer[0] = 0;
3897             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
3898         }
3899 #endif
3900         return handle != NULL;
3901     }
3902
3903     ~Impl()
3904     {
3905         if( handle )
3906         {
3907 #ifdef _WIN32
3908             if (!cv::__termination)
3909 #endif
3910             {
3911                 clReleaseProgram(handle);
3912             }
3913             handle = NULL;
3914         }
3915     }
3916
3917     cl_program handle;
3918
3919     String buildflags;
3920     String sourceModule_;
3921     String sourceName_;
3922 };
3923
3924
3925 Program::Program() { p = 0; }
3926
3927 Program::Program(const ProgramSource& src,
3928         const String& buildflags, String& errmsg)
3929 {
3930     p = 0;
3931     create(src, buildflags, errmsg);
3932 }
3933
3934 Program::Program(const Program& prog)
3935 {
3936     p = prog.p;
3937     if(p)
3938         p->addref();
3939 }
3940
3941 Program& Program::operator = (const Program& prog)
3942 {
3943     Impl* newp = (Impl*)prog.p;
3944     if(newp)
3945         newp->addref();
3946     if(p)
3947         p->release();
3948     p = newp;
3949     return *this;
3950 }
3951
3952 Program::~Program()
3953 {
3954     if(p)
3955         p->release();
3956 }
3957
3958 bool Program::create(const ProgramSource& src,
3959             const String& buildflags, String& errmsg)
3960 {
3961     if(p)
3962     {
3963         p->release();
3964         p = NULL;
3965     }
3966     p = new Impl(src, buildflags, errmsg);
3967     if(!p->handle)
3968     {
3969         p->release();
3970         p = 0;
3971     }
3972     return p != 0;
3973 }
3974
3975 void* Program::ptr() const
3976 {
3977     return p ? p->handle : 0;
3978 }
3979
3980 #ifndef OPENCV_REMOVE_DEPRECATED_API
3981 const ProgramSource& Program::source() const
3982 {
3983     CV_Error(Error::StsNotImplemented, "Removed API");
3984 }
3985
3986 bool Program::read(const String& bin, const String& buildflags)
3987 {
3988     CV_UNUSED(bin); CV_UNUSED(buildflags);
3989     CV_Error(Error::StsNotImplemented, "Removed API");
3990 }
3991
3992 bool Program::write(String& bin) const
3993 {
3994     CV_UNUSED(bin);
3995     CV_Error(Error::StsNotImplemented, "Removed API");
3996 }
3997
3998 String Program::getPrefix() const
3999 {
4000     if(!p)
4001         return String();
4002     Context::Impl* ctx_ = Context::getDefault().getImpl();
4003     CV_Assert(ctx_);
4004     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4005 }
4006
4007 String Program::getPrefix(const String& buildflags)
4008 {
4009         Context::Impl* ctx_ = Context::getDefault().getImpl();
4010         CV_Assert(ctx_);
4011         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4012 }
4013 #endif // OPENCV_REMOVE_DEPRECATED_API
4014
4015 void Program::getBinary(std::vector<char>& binary) const
4016 {
4017     CV_Assert(p && "Empty program");
4018     p->getProgramBinary(binary);
4019 }
4020
4021 Program Context::Impl::getProg(const ProgramSource& src,
4022                                const String& buildflags, String& errmsg)
4023 {
4024     size_t limit = getProgramCountLimit();
4025     const ProgramSource::Impl* src_ = src.getImpl();
4026     CV_Assert(src_);
4027     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4028             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4029             getPrefixString().c_str(),
4030             buildflags.c_str());
4031     {
4032         cv::AutoLock lock(program_cache_mutex);
4033         phash_t::iterator it = phash.find(key);
4034         if (it != phash.end())
4035         {
4036             // TODO LRU cache
4037             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4038             if (i != cacheList.end() && i != cacheList.begin())
4039             {
4040                 cacheList.erase(i);
4041                 cacheList.push_front(key);
4042             }
4043             return it->second;
4044         }
4045         { // cleanup program cache
4046             size_t sz = phash.size();
4047             if (limit > 0 && sz >= limit)
4048             {
4049                 static bool warningFlag = false;
4050                 if (!warningFlag)
4051                 {
4052                     printf("\nWARNING: OpenCV-OpenCL:\n"
4053                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4054                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4055                     warningFlag = true;
4056                 }
4057                 while (!cacheList.empty())
4058                 {
4059                     size_t c = phash.erase(cacheList.back());
4060                     cacheList.pop_back();
4061                     if (c != 0)
4062                         break;
4063                 }
4064             }
4065         }
4066     }
4067     Program prog(src, buildflags, errmsg);
4068     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4069     {
4070         cv::AutoLock lock(program_cache_mutex);
4071         phash.insert(std::pair<std::string, Program>(key, prog));
4072         cacheList.push_front(key);
4073     }
4074     return prog;
4075 }
4076
4077
4078 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4079
4080 template<typename T>
4081 class OpenCLBufferPool
4082 {
4083 protected:
4084     ~OpenCLBufferPool() { }
4085 public:
4086     virtual T allocate(size_t size) = 0;
4087     virtual void release(T buffer) = 0;
4088 };
4089
4090 template <typename Derived, typename BufferEntry, typename T>
4091 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4092 {
4093 private:
4094     inline Derived& derived() { return *static_cast<Derived*>(this); }
4095 protected:
4096     Mutex mutex_;
4097
4098     size_t currentReservedSize;
4099     size_t maxReservedSize;
4100
4101     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4102     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4103
4104     // synchronized
4105     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4106     {
4107         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4108         for (; i != allocatedEntries_.end(); ++i)
4109         {
4110             BufferEntry& e = *i;
4111             if (e.clBuffer_ == buffer)
4112             {
4113                 entry = e;
4114                 allocatedEntries_.erase(i);
4115                 return true;
4116             }
4117         }
4118         return false;
4119     }
4120
4121     // synchronized
4122     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4123     {
4124         if (reservedEntries_.empty())
4125             return false;
4126         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4127         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4128         BufferEntry result;
4129         size_t minDiff = (size_t)(-1);
4130         for (; i != reservedEntries_.end(); ++i)
4131         {
4132             BufferEntry& e = *i;
4133             if (e.capacity_ >= size)
4134             {
4135                 size_t diff = e.capacity_ - size;
4136                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4137                 {
4138                     minDiff = diff;
4139                     result_pos = i;
4140                     result = e;
4141                     if (diff == 0)
4142                         break;
4143                 }
4144             }
4145         }
4146         if (result_pos != reservedEntries_.end())
4147         {
4148             //CV_DbgAssert(result == *result_pos);
4149             reservedEntries_.erase(result_pos);
4150             entry = result;
4151             currentReservedSize -= entry.capacity_;
4152             allocatedEntries_.push_back(entry);
4153             return true;
4154         }
4155         return false;
4156     }
4157
4158     // synchronized
4159     void _checkSizeOfReservedEntries()
4160     {
4161         while (currentReservedSize > maxReservedSize)
4162         {
4163             CV_DbgAssert(!reservedEntries_.empty());
4164             const BufferEntry& entry = reservedEntries_.back();
4165             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4166             currentReservedSize -= entry.capacity_;
4167             derived()._releaseBufferEntry(entry);
4168             reservedEntries_.pop_back();
4169         }
4170     }
4171
4172     inline size_t _allocationGranularity(size_t size)
4173     {
4174         // heuristic values
4175         if (size < 1024*1024)
4176             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4177         else if (size < 16*1024*1024)
4178             return 64*1024;
4179         else
4180             return 1024*1024;
4181     }
4182
4183 public:
4184     OpenCLBufferPoolBaseImpl()
4185         : currentReservedSize(0),
4186           maxReservedSize(0)
4187     {
4188         // nothing
4189     }
4190     virtual ~OpenCLBufferPoolBaseImpl()
4191     {
4192         freeAllReservedBuffers();
4193         CV_Assert(reservedEntries_.empty());
4194     }
4195 public:
4196     virtual T allocate(size_t size) CV_OVERRIDE
4197     {
4198         AutoLock locker(mutex_);
4199         BufferEntry entry;
4200         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4201         {
4202             CV_DbgAssert(size <= entry.capacity_);
4203             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4204         }
4205         else
4206         {
4207             derived()._allocateBufferEntry(entry, size);
4208         }
4209         return entry.clBuffer_;
4210     }
4211     virtual void release(T buffer) CV_OVERRIDE
4212     {
4213         AutoLock locker(mutex_);
4214         BufferEntry entry;
4215         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4216         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4217         {
4218             derived()._releaseBufferEntry(entry);
4219         }
4220         else
4221         {
4222             reservedEntries_.push_front(entry);
4223             currentReservedSize += entry.capacity_;
4224             _checkSizeOfReservedEntries();
4225         }
4226     }
4227
4228     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4229     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4230     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4231     {
4232         AutoLock locker(mutex_);
4233         size_t oldMaxReservedSize = maxReservedSize;
4234         maxReservedSize = size;
4235         if (maxReservedSize < oldMaxReservedSize)
4236         {
4237             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4238             for (; i != reservedEntries_.end();)
4239             {
4240                 const BufferEntry& entry = *i;
4241                 if (entry.capacity_ > maxReservedSize / 8)
4242                 {
4243                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4244                     currentReservedSize -= entry.capacity_;
4245                     derived()._releaseBufferEntry(entry);
4246                     i = reservedEntries_.erase(i);
4247                     continue;
4248                 }
4249                 ++i;
4250             }
4251             _checkSizeOfReservedEntries();
4252         }
4253     }
4254     virtual void freeAllReservedBuffers() CV_OVERRIDE
4255     {
4256         AutoLock locker(mutex_);
4257         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4258         for (; i != reservedEntries_.end(); ++i)
4259         {
4260             const BufferEntry& entry = *i;
4261             derived()._releaseBufferEntry(entry);
4262         }
4263         reservedEntries_.clear();
4264         currentReservedSize = 0;
4265     }
4266 };
4267
4268 struct CLBufferEntry
4269 {
4270     cl_mem clBuffer_;
4271     size_t capacity_;
4272     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4273 };
4274
4275 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4276 {
4277 public:
4278     typedef struct CLBufferEntry BufferEntry;
4279 protected:
4280     int createFlags_;
4281 public:
4282     OpenCLBufferPoolImpl(int createFlags = 0)
4283         : createFlags_(createFlags)
4284     {
4285     }
4286
4287     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4288     {
4289         CV_DbgAssert(entry.clBuffer_ == NULL);
4290         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4291         Context& ctx = Context::getDefault();
4292         cl_int retval = CL_SUCCESS;
4293         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4294         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4295         CV_Assert(entry.clBuffer_ != NULL);
4296         if(retval == CL_SUCCESS)
4297         {
4298             CV_IMPL_ADD(CV_IMPL_OCL);
4299         }
4300         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4301                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4302         allocatedEntries_.push_back(entry);
4303     }
4304
4305     void _releaseBufferEntry(const BufferEntry& entry)
4306     {
4307         CV_Assert(entry.capacity_ != 0);
4308         CV_Assert(entry.clBuffer_ != NULL);
4309         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4310                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4311         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4312     }
4313 };
4314
4315 #ifdef HAVE_OPENCL_SVM
4316 struct CLSVMBufferEntry
4317 {
4318     void* clBuffer_;
4319     size_t capacity_;
4320     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4321 };
4322 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4323 {
4324 public:
4325     typedef struct CLSVMBufferEntry BufferEntry;
4326 public:
4327     OpenCLSVMBufferPoolImpl()
4328     {
4329     }
4330
4331     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4332     {
4333         CV_DbgAssert(entry.clBuffer_ == NULL);
4334         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4335
4336         Context& ctx = Context::getDefault();
4337         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4338         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4339         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4340                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4341
4342         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4343         CV_DbgAssert(svmFns->isValid());
4344
4345         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4346         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4347         CV_Assert(buf);
4348
4349         entry.clBuffer_ = buf;
4350         {
4351             CV_IMPL_ADD(CV_IMPL_OCL);
4352         }
4353         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4354                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4355         allocatedEntries_.push_back(entry);
4356     }
4357
4358     void _releaseBufferEntry(const BufferEntry& entry)
4359     {
4360         CV_Assert(entry.capacity_ != 0);
4361         CV_Assert(entry.clBuffer_ != NULL);
4362         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4363                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4364         Context& ctx = Context::getDefault();
4365         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4366         CV_DbgAssert(svmFns->isValid());
4367         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
4368         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
4369     }
4370 };
4371 #endif
4372
4373
4374
4375 template <bool readAccess, bool writeAccess>
4376 class AlignedDataPtr
4377 {
4378 protected:
4379     const size_t size_;
4380     uchar* const originPtr_;
4381     const size_t alignment_;
4382     uchar* ptr_;
4383     uchar* allocatedPtr_;
4384
4385 public:
4386     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
4387         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
4388     {
4389         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4390         CV_DbgAssert(!readAccess || ptr);
4391         if (((size_t)ptr_ & (alignment - 1)) != 0)
4392         {
4393             allocatedPtr_ = new uchar[size_ + alignment - 1];
4394             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4395             if (readAccess)
4396             {
4397                 memcpy(ptr_, originPtr_, size_);
4398             }
4399         }
4400     }
4401
4402     uchar* getAlignedPtr() const
4403     {
4404         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4405         return ptr_;
4406     }
4407
4408     ~AlignedDataPtr()
4409     {
4410         if (allocatedPtr_)
4411         {
4412             if (writeAccess)
4413             {
4414                 memcpy(originPtr_, ptr_, size_);
4415             }
4416             delete[] allocatedPtr_;
4417             allocatedPtr_ = NULL;
4418         }
4419         ptr_ = NULL;
4420     }
4421 private:
4422     AlignedDataPtr(const AlignedDataPtr&); // disabled
4423     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
4424 };
4425
4426 template <bool readAccess, bool writeAccess>
4427 class AlignedDataPtr2D
4428 {
4429 protected:
4430     const size_t size_;
4431     uchar* const originPtr_;
4432     const size_t alignment_;
4433     uchar* ptr_;
4434     uchar* allocatedPtr_;
4435     size_t rows_;
4436     size_t cols_;
4437     size_t step_;
4438
4439 public:
4440     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
4441         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
4442     {
4443         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
4444         CV_DbgAssert(!readAccess || ptr != NULL);
4445         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
4446         {
4447             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
4448             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
4449             if (readAccess)
4450             {
4451                 for (size_t i = 0; i < rows_; i++)
4452                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
4453             }
4454         }
4455     }
4456
4457     uchar* getAlignedPtr() const
4458     {
4459         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
4460         return ptr_;
4461     }
4462
4463     ~AlignedDataPtr2D()
4464     {
4465         if (allocatedPtr_)
4466         {
4467             if (writeAccess)
4468             {
4469                 for (size_t i = 0; i < rows_; i++)
4470                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
4471             }
4472             delete[] allocatedPtr_;
4473             allocatedPtr_ = NULL;
4474         }
4475         ptr_ = NULL;
4476     }
4477 private:
4478     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
4479     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
4480 };
4481
4482 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
4483 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
4484 #endif
4485
4486 class OpenCLAllocator CV_FINAL : public MatAllocator
4487 {
4488     mutable OpenCLBufferPoolImpl bufferPool;
4489     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
4490 #ifdef  HAVE_OPENCL_SVM
4491     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
4492 #endif
4493
4494 public:
4495     enum AllocatorFlags
4496     {
4497         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
4498         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
4499 #ifdef HAVE_OPENCL_SVM
4500         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
4501 #endif
4502         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
4503     };
4504
4505     OpenCLAllocator()
4506         : bufferPool(0),
4507           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
4508     {
4509         size_t defaultPoolSize, poolSize;
4510         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
4511         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
4512         bufferPool.setMaxReservedSize(poolSize);
4513         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
4514         bufferPoolHostPtr.setMaxReservedSize(poolSize);
4515 #ifdef HAVE_OPENCL_SVM
4516         poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
4517         bufferPoolSVM.setMaxReservedSize(poolSize);
4518 #endif
4519
4520         matStdAllocator = Mat::getDefaultAllocator();
4521     }
4522     ~OpenCLAllocator()
4523     {
4524         flushCleanupQueue();
4525     }
4526
4527     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
4528             int flags, UMatUsageFlags usageFlags) const
4529     {
4530         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
4531         return u;
4532     }
4533
4534     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
4535     {
4536         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
4537         return value;
4538     }
4539     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
4540     {
4541         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
4542         return value;
4543     }
4544
4545     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
4546     {
4547         const Device& dev = ctx.device(0);
4548         createFlags = 0;
4549         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
4550             createFlags |= CL_MEM_ALLOC_HOST_PTR;
4551
4552         if (!isOpenCLCopyingForced() &&
4553             (isOpenCLMapForced() ||
4554                 (dev.hostUnifiedMemory()
4555 #ifndef __APPLE__
4556                 || dev.isIntel()
4557 #endif
4558                 )
4559             )
4560         )
4561             flags0 = 0;
4562         else
4563             flags0 = UMatData::COPY_ON_MAP;
4564     }
4565
4566     UMatData* allocate(int dims, const int* sizes, int type,
4567                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4568     {
4569         if(!useOpenCL())
4570             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4571         CV_Assert(data == 0);
4572         size_t total = CV_ELEM_SIZE(type);
4573         for( int i = dims-1; i >= 0; i-- )
4574         {
4575             if( step )
4576                 step[i] = total;
4577             total *= sizes[i];
4578         }
4579
4580         Context& ctx = Context::getDefault();
4581         flushCleanupQueue();
4582
4583         int createFlags = 0, flags0 = 0;
4584         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
4585
4586         void* handle = NULL;
4587         int allocatorFlags = 0;
4588
4589 #ifdef HAVE_OPENCL_SVM
4590         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4591         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
4592         {
4593             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
4594             handle = bufferPoolSVM.allocate(total);
4595
4596             // this property is constant, so single buffer pool can be used here
4597             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4598             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4599         }
4600         else
4601 #endif
4602         if (createFlags == 0)
4603         {
4604             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
4605             handle = bufferPool.allocate(total);
4606         }
4607         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
4608         {
4609             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
4610             handle = bufferPoolHostPtr.allocate(total);
4611         }
4612         else
4613         {
4614             CV_Assert(handle != NULL); // Unsupported, throw
4615         }
4616
4617         if (!handle)
4618             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
4619
4620         UMatData* u = new UMatData(this);
4621         u->data = 0;
4622         u->size = total;
4623         u->handle = handle;
4624         u->flags = flags0;
4625         u->allocatorFlags_ = allocatorFlags;
4626         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
4627         u->markHostCopyObsolete(true);
4628         opencl_allocator_stats.onAllocate(u->size);
4629         return u;
4630     }
4631
4632     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
4633     {
4634         if(!u)
4635             return false;
4636
4637         flushCleanupQueue();
4638
4639         UMatDataAutoLock lock(u);
4640
4641         if(u->handle == 0)
4642         {
4643             CV_Assert(u->origdata != 0);
4644             Context& ctx = Context::getDefault();
4645             int createFlags = 0, flags0 = 0;
4646             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
4647
4648             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
4649
4650             cl_context ctx_handle = (cl_context)ctx.ptr();
4651             int allocatorFlags = 0;
4652             int tempUMatFlags = 0;
4653             void* handle = NULL;
4654             cl_int retval = CL_SUCCESS;
4655
4656 #ifdef HAVE_OPENCL_SVM
4657             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4658             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
4659             if (useSVM && svmCaps.isSupportFineGrainSystem())
4660             {
4661                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
4662                 tempUMatFlags = UMatData::TEMP_UMAT;
4663                 handle = u->origdata;
4664                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
4665             }
4666             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
4667             {
4668                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
4669                 {
4670                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4671
4672                     cl_svm_mem_flags memFlags = createFlags |
4673                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4674
4675                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4676                     CV_DbgAssert(svmFns->isValid());
4677
4678                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
4679                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
4680                     CV_Assert(handle);
4681
4682                     cl_command_queue q = NULL;
4683                     if (!isFineGrainBuffer)
4684                     {
4685                         q = (cl_command_queue)Queue::getDefault().ptr();
4686                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
4687                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
4688                                 handle, u->size,
4689                                 0, NULL, NULL);
4690                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4691
4692                     }
4693                     memcpy(handle, u->origdata, u->size);
4694                     if (!isFineGrainBuffer)
4695                     {
4696                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
4697                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
4698                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4699                     }
4700
4701                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
4702                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
4703                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
4704                 }
4705             }
4706             else
4707 #endif
4708             {
4709                 if( copyOnMap )
4710                     accessFlags &= ~ACCESS_FAST;
4711
4712                 tempUMatFlags = UMatData::TEMP_UMAT;
4713                 if (
4714                 #ifdef __APPLE__
4715                     !copyOnMap &&
4716                 #endif
4717                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
4718                     // There are OpenCL runtime issues for less aligned data
4719                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
4720                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
4721                     // Avoid sharing of host memory between OpenCL buffers
4722                     && !(u->originalUMatData && u->originalUMatData->handle)
4723                 )
4724                 {
4725                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
4726                                             u->size, u->origdata, &retval);
4727                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
4728                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4729                 }
4730                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
4731                 {
4732                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
4733                                                u->size, u->origdata, &retval);
4734                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
4735                             (long long int)u->size, u->origdata, (void*)handle).c_str());
4736                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
4737                 }
4738             }
4739             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
4740             if(!handle || retval != CL_SUCCESS)
4741                 return false;
4742             u->handle = handle;
4743             u->prevAllocator = u->currAllocator;
4744             u->currAllocator = this;
4745             u->flags |= tempUMatFlags | flags0;
4746             u->allocatorFlags_ = allocatorFlags;
4747         }
4748         if(accessFlags & ACCESS_WRITE)
4749             u->markHostCopyObsolete(true);
4750         opencl_allocator_stats.onAllocate(u->size);
4751         return true;
4752     }
4753
4754     /*void sync(UMatData* u) const
4755     {
4756         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4757         UMatDataAutoLock lock(u);
4758
4759         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
4760         {
4761             if( u->tempCopiedUMat() )
4762             {
4763                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4764                                     u->size, u->origdata, 0, 0, 0);
4765             }
4766             else
4767             {
4768                 cl_int retval = 0;
4769                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4770                                                 (CL_MAP_READ | CL_MAP_WRITE),
4771                                                 0, u->size, 0, 0, 0, &retval);
4772                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4773                 clFinish(q);
4774             }
4775             u->markHostCopyObsolete(false);
4776         }
4777         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
4778         {
4779             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4780                                  u->size, u->data, 0, 0, 0);
4781         }
4782     }*/
4783
4784     void deallocate(UMatData* u) const CV_OVERRIDE
4785     {
4786         if(!u)
4787             return;
4788
4789         CV_Assert(u->urefcount == 0);
4790         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
4791
4792         CV_Assert(u->handle != 0);
4793         CV_Assert(u->mapcount == 0);
4794
4795         if (u->flags & UMatData::ASYNC_CLEANUP)
4796             addToCleanupQueue(u);
4797         else
4798             deallocate_(u);
4799     }
4800
4801     void deallocate_(UMatData* u) const
4802     {
4803         CV_Assert(u);
4804         CV_Assert(u->handle);
4805         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
4806         {
4807             opencl_allocator_stats.onFree(u->size);
4808         }
4809
4810 #ifdef _WIN32
4811         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
4812             return;             // avoid any OpenCL calls
4813 #endif
4814         if(u->tempUMat())
4815         {
4816             CV_Assert(u->origdata);
4817 //            UMatDataAutoLock lock(u);
4818
4819             if (u->hostCopyObsolete())
4820             {
4821 #ifdef HAVE_OPENCL_SVM
4822                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4823                 {
4824                     Context& ctx = Context::getDefault();
4825                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4826                     CV_DbgAssert(svmFns->isValid());
4827
4828                     if( u->tempCopiedUMat() )
4829                     {
4830                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4831                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
4832                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
4833                         cl_command_queue q = NULL;
4834                         if (!isFineGrainBuffer)
4835                         {
4836                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
4837                             q = (cl_command_queue)Queue::getDefault().ptr();
4838                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
4839                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
4840                                     u->handle, u->size,
4841                                     0, NULL, NULL);
4842                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
4843                         }
4844                         clFinish(q);
4845                         memcpy(u->origdata, u->handle, u->size);
4846                         if (!isFineGrainBuffer)
4847                         {
4848                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4849                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4850                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4851                         }
4852                     }
4853                     else
4854                     {
4855                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
4856                         // nothing
4857                     }
4858                 }
4859                 else
4860 #endif
4861                 {
4862                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4863                     if( u->tempCopiedUMat() )
4864                     {
4865                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
4866                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
4867                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
4868                     }
4869                     else
4870                     {
4871                         cl_int retval = 0;
4872                         if (u->tempUMat())
4873                         {
4874                             CV_Assert(u->mapcount == 0);
4875                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
4876                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
4877                                 (CL_MAP_READ | CL_MAP_WRITE),
4878                                 0, u->size, 0, 0, 0, &retval);
4879                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
4880                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
4881                             if (u->originalUMatData)
4882                             {
4883                                 CV_Assert(u->originalUMatData->data == data);
4884                             }
4885                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
4886                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
4887                             CV_OCL_DBG_CHECK(clFinish(q));
4888                         }
4889                     }
4890                 }
4891                 u->markHostCopyObsolete(false);
4892             }
4893             else
4894             {
4895                 // nothing
4896             }
4897 #ifdef HAVE_OPENCL_SVM
4898             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4899             {
4900                 if( u->tempCopiedUMat() )
4901                 {
4902                     Context& ctx = Context::getDefault();
4903                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4904                     CV_DbgAssert(svmFns->isValid());
4905
4906                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
4907                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
4908                 }
4909             }
4910             else
4911 #endif
4912             {
4913                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
4914                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
4915             }
4916             u->handle = 0;
4917             u->markDeviceCopyObsolete(true);
4918             u->currAllocator = u->prevAllocator;
4919             u->prevAllocator = NULL;
4920             if(u->data && u->copyOnMap() && u->data != u->origdata)
4921                 fastFree(u->data);
4922             u->data = u->origdata;
4923             u->currAllocator->deallocate(u);
4924             u = NULL;
4925         }
4926         else
4927         {
4928             CV_Assert(u->origdata == NULL);
4929             if(u->data && u->copyOnMap() && u->data != u->origdata)
4930             {
4931                 fastFree(u->data);
4932                 u->data = 0;
4933                 u->markHostCopyObsolete(true);
4934             }
4935             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
4936             {
4937                 bufferPool.release((cl_mem)u->handle);
4938             }
4939             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
4940             {
4941                 bufferPoolHostPtr.release((cl_mem)u->handle);
4942             }
4943 #ifdef HAVE_OPENCL_SVM
4944             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
4945             {
4946                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
4947                 {
4948                     //nothing
4949                 }
4950                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
4951                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
4952                 {
4953                     Context& ctx = Context::getDefault();
4954                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4955                     CV_DbgAssert(svmFns->isValid());
4956                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4957
4958                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
4959                     {
4960                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
4961                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
4962                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
4963                     }
4964                 }
4965                 bufferPoolSVM.release((void*)u->handle);
4966             }
4967 #endif
4968             else
4969             {
4970                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
4971             }
4972             u->handle = 0;
4973             u->markDeviceCopyObsolete(true);
4974             delete u;
4975             u = NULL;
4976         }
4977         CV_Assert(u == NULL);
4978     }
4979
4980     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
4981     void map(UMatData* u, int accessFlags) const CV_OVERRIDE
4982     {
4983         CV_Assert(u && u->handle);
4984
4985         if(accessFlags & ACCESS_WRITE)
4986             u->markDeviceCopyObsolete(true);
4987
4988         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4989
4990         {
4991             if( !u->copyOnMap() )
4992             {
4993                 // TODO
4994                 // because there can be other map requests for the same UMat with different access flags,
4995                 // we use the universal (read-write) access mode.
4996 #ifdef HAVE_OPENCL_SVM
4997                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
4998                 {
4999                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5000                     {
5001                         Context& ctx = Context::getDefault();
5002                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5003                         CV_DbgAssert(svmFns->isValid());
5004
5005                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5006                         {
5007                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5008                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5009                                     u->handle, u->size,
5010                                     0, NULL, NULL);
5011                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5012                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5013                         }
5014                     }
5015                     clFinish(q);
5016                     u->data = (uchar*)u->handle;
5017                     u->markHostCopyObsolete(false);
5018                     u->markDeviceMemMapped(true);
5019                     return;
5020                 }
5021 #endif
5022
5023                 cl_int retval = CL_SUCCESS;
5024                 if (!u->deviceMemMapped())
5025                 {
5026                     CV_Assert(u->refcount == 1);
5027                     CV_Assert(u->mapcount++ == 0);
5028                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5029                                                          (CL_MAP_READ | CL_MAP_WRITE),
5030                                                          0, u->size, 0, 0, 0, &retval);
5031                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5032                 }
5033                 if (u->data && retval == CL_SUCCESS)
5034                 {
5035                     u->markHostCopyObsolete(false);
5036                     u->markDeviceMemMapped(true);
5037                     return;
5038                 }
5039
5040                 // TODO Is it really a good idea and was it tested well?
5041                 // if map failed, switch to copy-on-map mode for the particular buffer
5042                 u->flags |= UMatData::COPY_ON_MAP;
5043             }
5044
5045             if(!u->data)
5046             {
5047                 u->data = (uchar*)fastMalloc(u->size);
5048                 u->markHostCopyObsolete(true);
5049             }
5050         }
5051
5052         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
5053         {
5054             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5055 #ifdef HAVE_OPENCL_SVM
5056             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5057 #endif
5058             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5059                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5060             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5061                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5062             u->markHostCopyObsolete(false);
5063         }
5064     }
5065
5066     void unmap(UMatData* u) const CV_OVERRIDE
5067     {
5068         if(!u)
5069             return;
5070
5071
5072         CV_Assert(u->handle != 0);
5073
5074         UMatDataAutoLock autolock(u);
5075
5076         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5077         cl_int retval = 0;
5078         if( !u->copyOnMap() && u->deviceMemMapped() )
5079         {
5080             CV_Assert(u->data != NULL);
5081 #ifdef HAVE_OPENCL_SVM
5082             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5083             {
5084                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5085                 {
5086                     Context& ctx = Context::getDefault();
5087                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5088                     CV_DbgAssert(svmFns->isValid());
5089
5090                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5091                     {
5092                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5093                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5094                                 0, NULL, NULL);
5095                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5096                         clFinish(q);
5097                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5098                     }
5099                 }
5100                 if (u->refcount == 0)
5101                     u->data = 0;
5102                 u->markDeviceCopyObsolete(false);
5103                 u->markHostCopyObsolete(true);
5104                 return;
5105             }
5106 #endif
5107             if (u->refcount == 0)
5108             {
5109                 CV_Assert(u->mapcount-- == 1);
5110                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5111                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5112                 if (Device::getDefault().isAMD())
5113                 {
5114                     // required for multithreaded applications (see stitching test)
5115                     CV_OCL_DBG_CHECK(clFinish(q));
5116                 }
5117                 u->markDeviceMemMapped(false);
5118                 u->data = 0;
5119                 u->markDeviceCopyObsolete(false);
5120                 u->markHostCopyObsolete(true);
5121             }
5122         }
5123         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5124         {
5125             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5126 #ifdef HAVE_OPENCL_SVM
5127             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5128 #endif
5129             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5130                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5131             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5132                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5133             u->markDeviceCopyObsolete(false);
5134             u->markHostCopyObsolete(true);
5135         }
5136     }
5137
5138     bool checkContinuous(int dims, const size_t sz[],
5139                          const size_t srcofs[], const size_t srcstep[],
5140                          const size_t dstofs[], const size_t dststep[],
5141                          size_t& total, size_t new_sz[],
5142                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5143                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5144     {
5145         bool iscontinuous = true;
5146         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5147         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5148         total = sz[dims-1];
5149         for( int i = dims-2; i >= 0; i-- )
5150         {
5151             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5152                 iscontinuous = false;
5153             total *= sz[i];
5154             if( srcofs )
5155                 srcrawofs += srcofs[i]*srcstep[i];
5156             if( dstofs )
5157                 dstrawofs += dstofs[i]*dststep[i];
5158         }
5159
5160         if( !iscontinuous )
5161         {
5162             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5163             if( dims == 2 )
5164             {
5165                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5166                 // we assume that new_... arrays are initialized by caller
5167                 // with 0's, so there is no else branch
5168                 if( srcofs )
5169                 {
5170                     new_srcofs[0] = srcofs[1];
5171                     new_srcofs[1] = srcofs[0];
5172                     new_srcofs[2] = 0;
5173                 }
5174
5175                 if( dstofs )
5176                 {
5177                     new_dstofs[0] = dstofs[1];
5178                     new_dstofs[1] = dstofs[0];
5179                     new_dstofs[2] = 0;
5180                 }
5181
5182                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5183                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5184             }
5185             else
5186             {
5187                 // we could check for dims == 3 here,
5188                 // but from user perspective this one is more informative
5189                 CV_Assert(dims <= 3);
5190                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5191                 if( srcofs )
5192                 {
5193                     new_srcofs[0] = srcofs[2];
5194                     new_srcofs[1] = srcofs[1];
5195                     new_srcofs[2] = srcofs[0];
5196                 }
5197
5198                 if( dstofs )
5199                 {
5200                     new_dstofs[0] = dstofs[2];
5201                     new_dstofs[1] = dstofs[1];
5202                     new_dstofs[2] = dstofs[0];
5203                 }
5204
5205                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5206                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5207             }
5208         }
5209         return iscontinuous;
5210     }
5211
5212     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5213                   const size_t srcofs[], const size_t srcstep[],
5214                   const size_t dststep[]) const CV_OVERRIDE
5215     {
5216         if(!u)
5217             return;
5218         UMatDataAutoLock autolock(u);
5219
5220         if( u->data && !u->hostCopyObsolete() )
5221         {
5222             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5223             return;
5224         }
5225         CV_Assert( u->handle != 0 );
5226
5227         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5228
5229         size_t total = 0, new_sz[] = {0, 0, 0};
5230         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5231         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5232
5233         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5234                                             total, new_sz,
5235                                             srcrawofs, new_srcofs, new_srcstep,
5236                                             dstrawofs, new_dstofs, new_dststep);
5237
5238 #ifdef HAVE_OPENCL_SVM
5239         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5240         {
5241             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5242             Context& ctx = Context::getDefault();
5243             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5244             CV_DbgAssert(svmFns->isValid());
5245
5246             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5247             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5248             {
5249                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5250                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5251                         u->handle, u->size,
5252                         0, NULL, NULL);
5253                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5254             }
5255             clFinish(q);
5256             if( iscontinuous )
5257             {
5258                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5259             }
5260             else
5261             {
5262                 // This code is from MatAllocator::download()
5263                 int isz[CV_MAX_DIM];
5264                 uchar* srcptr = (uchar*)u->handle;
5265                 for( int i = 0; i < dims; i++ )
5266                 {
5267                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5268                     if( sz[i] == 0 )
5269                     return;
5270                     if( srcofs )
5271                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5272                     isz[i] = (int)sz[i];
5273                 }
5274
5275                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5276                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5277
5278                 const Mat* arrays[] = { &src, &dst };
5279                 uchar* ptrs[2];
5280                 NAryMatIterator it(arrays, ptrs, 2);
5281                 size_t j, planesz = it.size;
5282
5283                 for( j = 0; j < it.nplanes; j++, ++it )
5284                     memcpy(ptrs[1], ptrs[0], planesz);
5285             }
5286             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5287             {
5288                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5289                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5290                         0, NULL, NULL);
5291                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5292                 clFinish(q);
5293             }
5294         }
5295         else
5296 #endif
5297         {
5298             if( iscontinuous )
5299             {
5300                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5301                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5302                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5303             }
5304             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5305             {
5306                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5307                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5308                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5309                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5310                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5311                 uchar* ptr = alignedPtr.getAlignedPtr();
5312
5313                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5314                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5315                 total = std::min(total, u->size - new_srcrawofs);
5316                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5317                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5318                 for( size_t i = 0; i < new_sz[1]; i++ )
5319                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5320             }
5321             else
5322             {
5323                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5324                 uchar* ptr = alignedPtr.getAlignedPtr();
5325
5326                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5327                     new_srcofs, new_dstofs, new_sz,
5328                     new_srcstep[0], 0,
5329                     new_dststep[0], 0,
5330                     ptr, 0, 0, 0));
5331             }
5332         }
5333     }
5334
5335     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5336                 const size_t dstofs[], const size_t dststep[],
5337                 const size_t srcstep[]) const CV_OVERRIDE
5338     {
5339         if(!u)
5340             return;
5341
5342         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5343         CV_Assert(u->refcount == 0 || u->tempUMat());
5344
5345         size_t total = 0, new_sz[] = {0, 0, 0};
5346         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5347         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5348
5349         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
5350                                             total, new_sz,
5351                                             srcrawofs, new_srcofs, new_srcstep,
5352                                             dstrawofs, new_dstofs, new_dststep);
5353
5354         UMatDataAutoLock autolock(u);
5355
5356         // if there is cached CPU copy of the GPU matrix,
5357         // we could use it as a destination.
5358         // we can do it in 2 cases:
5359         //    1. we overwrite the whole content
5360         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
5361         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
5362         {
5363             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
5364             u->markHostCopyObsolete(false);
5365             u->markDeviceCopyObsolete(true);
5366             return;
5367         }
5368
5369         CV_Assert( u->handle != 0 );
5370         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5371
5372 #ifdef HAVE_OPENCL_SVM
5373         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5374         {
5375             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5376             Context& ctx = Context::getDefault();
5377             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5378             CV_DbgAssert(svmFns->isValid());
5379
5380             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5381             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5382             {
5383                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5384                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
5385                         u->handle, u->size,
5386                         0, NULL, NULL);
5387                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5388             }
5389             clFinish(q);
5390             if( iscontinuous )
5391             {
5392                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
5393             }
5394             else
5395             {
5396                 // This code is from MatAllocator::upload()
5397                 int isz[CV_MAX_DIM];
5398                 uchar* dstptr = (uchar*)u->handle;
5399                 for( int i = 0; i < dims; i++ )
5400                 {
5401                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5402                     if( sz[i] == 0 )
5403                     return;
5404                     if( dstofs )
5405                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5406                     isz[i] = (int)sz[i];
5407                 }
5408
5409                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
5410                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5411
5412                 const Mat* arrays[] = { &src, &dst };
5413                 uchar* ptrs[2];
5414                 NAryMatIterator it(arrays, ptrs, 2);
5415                 size_t j, planesz = it.size;
5416
5417                 for( j = 0; j < it.nplanes; j++, ++it )
5418                     memcpy(ptrs[1], ptrs[0], planesz);
5419             }
5420             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5421             {
5422                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5423                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5424                         0, NULL, NULL);
5425                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5426                 clFinish(q);
5427             }
5428         }
5429         else
5430 #endif
5431         {
5432             if( iscontinuous )
5433             {
5434                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5435                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5436                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
5437                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
5438                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5439             }
5440             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5441             {
5442                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5443                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5444                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
5445                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
5446                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5447                 uchar* ptr = alignedPtr.getAlignedPtr();
5448
5449                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5450                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
5451                 total = std::min(total, u->size - new_dstrawofs);
5452                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
5453                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
5454                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
5455                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5456                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5457                 for( size_t i = 0; i < new_sz[1]; i++ )
5458                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
5459                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5460                                                  new_dstrawofs, total, ptr, 0, 0, 0));
5461             }
5462             else
5463             {
5464                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5465                 uchar* ptr = alignedPtr.getAlignedPtr();
5466
5467                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5468                     new_dstofs, new_srcofs, new_sz,
5469                     new_dststep[0], 0,
5470                     new_srcstep[0], 0,
5471                     ptr, 0, 0, 0));
5472             }
5473         }
5474         u->markHostCopyObsolete(true);
5475 #ifdef HAVE_OPENCL_SVM
5476         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5477                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5478         {
5479             // nothing
5480         }
5481         else
5482 #endif
5483         {
5484             u->markHostCopyObsolete(true);
5485         }
5486         u->markDeviceCopyObsolete(false);
5487     }
5488
5489     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
5490               const size_t srcofs[], const size_t srcstep[],
5491               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
5492     {
5493         if(!src || !dst)
5494             return;
5495
5496         size_t total = 0, new_sz[] = {0, 0, 0};
5497         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5498         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5499
5500         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
5501                                             total, new_sz,
5502                                             srcrawofs, new_srcofs, new_srcstep,
5503                                             dstrawofs, new_dstofs, new_dststep);
5504
5505         UMatDataAutoLock src_autolock(src, dst);
5506
5507         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
5508         {
5509             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5510             return;
5511         }
5512         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
5513         {
5514             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5515             dst->markHostCopyObsolete(false);
5516 #ifdef HAVE_OPENCL_SVM
5517             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5518                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5519             {
5520                 // nothing
5521             }
5522             else
5523 #endif
5524             {
5525                 dst->markDeviceCopyObsolete(true);
5526             }
5527             return;
5528         }
5529
5530         // there should be no user-visible CPU copies of the UMat which we are going to copy to
5531         CV_Assert(dst->refcount == 0);
5532         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5533
5534         cl_int retval = CL_SUCCESS;
5535 #ifdef HAVE_OPENCL_SVM
5536         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
5537                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5538         {
5539             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
5540                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5541             {
5542                 Context& ctx = Context::getDefault();
5543                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5544                 CV_DbgAssert(svmFns->isValid());
5545
5546                 if( iscontinuous )
5547                 {
5548                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
5549                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
5550                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
5551                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
5552                             total, 0, NULL, NULL);
5553                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
5554                 }
5555                 else
5556                 {
5557                     clFinish(q);
5558                     // This code is from MatAllocator::download()/upload()
5559                     int isz[CV_MAX_DIM];
5560                     uchar* srcptr = (uchar*)src->handle;
5561                     for( int i = 0; i < dims; i++ )
5562                     {
5563                         CV_Assert( sz[i] <= (size_t)INT_MAX );
5564                         if( sz[i] == 0 )
5565                         return;
5566                         if( srcofs )
5567                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5568                         isz[i] = (int)sz[i];
5569                     }
5570                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
5571
5572                     uchar* dstptr = (uchar*)dst->handle;
5573                     for( int i = 0; i < dims; i++ )
5574                     {
5575                         if( dstofs )
5576                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
5577                     }
5578                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
5579
5580                     const Mat* arrays[] = { &m_src, &m_dst };
5581                     uchar* ptrs[2];
5582                     NAryMatIterator it(arrays, ptrs, 2);
5583                     size_t j, planesz = it.size;
5584
5585                     for( j = 0; j < it.nplanes; j++, ++it )
5586                         memcpy(ptrs[1], ptrs[0], planesz);
5587                 }
5588             }
5589             else
5590             {
5591                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5592                 {
5593                     map(src, ACCESS_READ);
5594                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
5595                     unmap(src);
5596                 }
5597                 else
5598                 {
5599                     map(dst, ACCESS_WRITE);
5600                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
5601                     unmap(dst);
5602                 }
5603             }
5604         }
5605         else
5606 #endif
5607         {
5608             if( iscontinuous )
5609             {
5610                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5611                                                srcrawofs, dstrawofs, total, 0, 0, 0);
5612                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
5613                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
5614             }
5615             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5616             {
5617                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5618                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5619                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
5620                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
5621                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
5622
5623                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5624                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5625                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
5626                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5627                 uchar* srcptr = srcBuf.getAlignedPtr();
5628                 uchar* dstptr = dstBuf.getAlignedPtr();
5629
5630                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
5631
5632                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
5633                 src_total = std::min(src_total, src->size - new_srcrawofs);
5634                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
5635                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
5636
5637                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
5638                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
5639                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5640                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5641
5642                 for( size_t i = 0; i < new_sz[1]; i++ )
5643                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
5644                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
5645                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
5646                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
5647             }
5648             else
5649             {
5650                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
5651                                                    new_srcofs, new_dstofs, new_sz,
5652                                                    new_srcstep[0], 0,
5653                                                    new_dststep[0], 0,
5654                                                    0, 0, 0));
5655             }
5656         }
5657         if (retval == CL_SUCCESS)
5658         {
5659             CV_IMPL_ADD(CV_IMPL_OCL)
5660         }
5661
5662 #ifdef HAVE_OPENCL_SVM
5663         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5664             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5665         {
5666             // nothing
5667         }
5668         else
5669 #endif
5670         {
5671             dst->markHostCopyObsolete(true);
5672         }
5673         dst->markDeviceCopyObsolete(false);
5674
5675         if( _sync )
5676         {
5677             CV_OCL_DBG_CHECK(clFinish(q));
5678         }
5679     }
5680
5681     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE {
5682 #ifdef HAVE_OPENCL_SVM
5683         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
5684         {
5685             return &bufferPoolSVM;
5686         }
5687 #endif
5688         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
5689         {
5690             return &bufferPoolHostPtr;
5691         }
5692         if (id != NULL && strcmp(id, "OCL") != 0)
5693         {
5694             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
5695         }
5696         return &bufferPool;
5697     }
5698
5699     MatAllocator* matStdAllocator;
5700
5701     mutable cv::Mutex cleanupQueueMutex;
5702     mutable std::deque<UMatData*> cleanupQueue;
5703
5704     void flushCleanupQueue() const
5705     {
5706         if (!cleanupQueue.empty())
5707         {
5708             std::deque<UMatData*> q;
5709             {
5710                 cv::AutoLock lock(cleanupQueueMutex);
5711                 q.swap(cleanupQueue);
5712             }
5713             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
5714             {
5715                 deallocate_(*i);
5716             }
5717         }
5718     }
5719     void addToCleanupQueue(UMatData* u) const
5720     {
5721         //TODO: Validation check: CV_Assert(!u->tempUMat());
5722         {
5723             cv::AutoLock lock(cleanupQueueMutex);
5724             cleanupQueue.push_back(u);
5725         }
5726     }
5727 };
5728
5729 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
5730 {
5731     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
5732     g_isOpenCVActivated = true;
5733     return g_allocator;
5734 }
5735 MatAllocator* getOpenCLAllocator()
5736 {
5737     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
5738 }
5739
5740 }} // namespace cv::ocl
5741
5742
5743 namespace cv {
5744
5745 // three funcs below are implemented in umatrix.cpp
5746 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
5747               bool autoSteps = false );
5748 void finalizeHdr(UMat& m);
5749
5750 } // namespace cv
5751
5752
5753 namespace cv { namespace ocl {
5754
5755 /*
5756 // Convert OpenCL buffer memory to UMat
5757 */
5758 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
5759 {
5760     int d = 2;
5761     int sizes[] = { rows, cols };
5762
5763     CV_Assert(0 <= d && d <= CV_MAX_DIM);
5764
5765     dst.release();
5766
5767     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
5768     dst.usageFlags = USAGE_DEFAULT;
5769
5770     setSize(dst, d, sizes, 0, true);
5771     dst.offset = 0;
5772
5773     cl_mem             memobj = (cl_mem)cl_mem_buffer;
5774     cl_mem_object_type mem_type = 0;
5775
5776     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5777
5778     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
5779
5780     size_t total = 0;
5781     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
5782
5783     CV_OCL_CHECK(clRetainMemObject(memobj));
5784
5785     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
5786     CV_Assert(total >= rows * step);
5787
5788     // attach clBuffer to UMatData
5789     dst.u = new UMatData(getOpenCLAllocator());
5790     dst.u->data            = 0;
5791     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
5792     dst.u->flags           = 0;
5793     dst.u->handle          = cl_mem_buffer;
5794     dst.u->origdata        = 0;
5795     dst.u->prevAllocator   = 0;
5796     dst.u->size            = total;
5797
5798     finalizeHdr(dst);
5799     dst.addref();
5800
5801     return;
5802 } // convertFromBuffer()
5803
5804
5805 /*
5806 // Convert OpenCL image2d_t memory to UMat
5807 */
5808 void convertFromImage(void* cl_mem_image, UMat& dst)
5809 {
5810     cl_mem             clImage = (cl_mem)cl_mem_image;
5811     cl_mem_object_type mem_type = 0;
5812
5813     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
5814
5815     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
5816
5817     cl_image_format fmt = { 0, 0 };
5818     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
5819
5820     int depth = CV_8U;
5821     switch (fmt.image_channel_data_type)
5822     {
5823     case CL_UNORM_INT8:
5824     case CL_UNSIGNED_INT8:
5825         depth = CV_8U;
5826         break;
5827
5828     case CL_SNORM_INT8:
5829     case CL_SIGNED_INT8:
5830         depth = CV_8S;
5831         break;
5832
5833     case CL_UNORM_INT16:
5834     case CL_UNSIGNED_INT16:
5835         depth = CV_16U;
5836         break;
5837
5838     case CL_SNORM_INT16:
5839     case CL_SIGNED_INT16:
5840         depth = CV_16S;
5841         break;
5842
5843     case CL_SIGNED_INT32:
5844         depth = CV_32S;
5845         break;
5846
5847     case CL_FLOAT:
5848         depth = CV_32F;
5849         break;
5850
5851     default:
5852         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
5853     }
5854
5855     int type = CV_8UC1;
5856     switch (fmt.image_channel_order)
5857     {
5858     case CL_R:
5859         type = CV_MAKE_TYPE(depth, 1);
5860         break;
5861
5862     case CL_RGBA:
5863     case CL_BGRA:
5864     case CL_ARGB:
5865         type = CV_MAKE_TYPE(depth, 4);
5866         break;
5867
5868     default:
5869         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
5870         break;
5871     }
5872
5873     size_t step = 0;
5874     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
5875
5876     size_t w = 0;
5877     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
5878
5879     size_t h = 0;
5880     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
5881
5882     dst.create((int)h, (int)w, type);
5883
5884     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
5885
5886     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5887
5888     size_t offset = 0;
5889     size_t src_origin[3] = { 0, 0, 0 };
5890     size_t region[3] = { w, h, 1 };
5891     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
5892
5893     CV_OCL_CHECK(clFinish(q));
5894
5895     return;
5896 } // convertFromImage()
5897
5898
5899 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
5900
5901 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
5902 {
5903     cl_uint numDevices = 0;
5904     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
5905     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
5906     {
5907         CV_OCL_DBG_CHECK_RESULT(status,
5908             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
5909     }
5910
5911     if (numDevices == 0)
5912     {
5913         devices.clear();
5914         return;
5915     }
5916
5917     devices.resize((size_t)numDevices);
5918     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
5919 }
5920
5921 struct PlatformInfo::Impl
5922 {
5923     Impl(void* id)
5924     {
5925         refcount = 1;
5926         handle = *(cl_platform_id*)id;
5927         getDevices(devices, handle);
5928     }
5929
5930     String getStrProp(cl_platform_info prop) const
5931     {
5932         char buf[1024];
5933         size_t sz=0;
5934         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
5935             sz < sizeof(buf) ? String(buf) : String();
5936     }
5937
5938     IMPLEMENT_REFCOUNTABLE();
5939     std::vector<cl_device_id> devices;
5940     cl_platform_id handle;
5941 };
5942
5943 PlatformInfo::PlatformInfo()
5944 {
5945     p = 0;
5946 }
5947
5948 PlatformInfo::PlatformInfo(void* platform_id)
5949 {
5950     p = new Impl(platform_id);
5951 }
5952
5953 PlatformInfo::~PlatformInfo()
5954 {
5955     if(p)
5956         p->release();
5957 }
5958
5959 PlatformInfo::PlatformInfo(const PlatformInfo& i)
5960 {
5961     if (i.p)
5962         i.p->addref();
5963     p = i.p;
5964 }
5965
5966 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
5967 {
5968     if (i.p != p)
5969     {
5970         if (i.p)
5971             i.p->addref();
5972         if (p)
5973             p->release();
5974         p = i.p;
5975     }
5976     return *this;
5977 }
5978
5979 int PlatformInfo::deviceNumber() const
5980 {
5981     return p ? (int)p->devices.size() : 0;
5982 }
5983
5984 void PlatformInfo::getDevice(Device& device, int d) const
5985 {
5986     CV_Assert(p && d < (int)p->devices.size() );
5987     if(p)
5988         device.set(p->devices[d]);
5989 }
5990
5991 String PlatformInfo::name() const
5992 {
5993     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
5994 }
5995
5996 String PlatformInfo::vendor() const
5997 {
5998     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
5999 }
6000
6001 String PlatformInfo::version() const
6002 {
6003     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
6004 }
6005
6006 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6007 {
6008     cl_uint numPlatforms = 0;
6009     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6010
6011     if (numPlatforms == 0)
6012     {
6013         platforms.clear();
6014         return;
6015     }
6016
6017     platforms.resize((size_t)numPlatforms);
6018     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6019 }
6020
6021 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6022 {
6023     std::vector<cl_platform_id> platforms;
6024     getPlatforms(platforms);
6025
6026     for (size_t i = 0; i < platforms.size(); i++)
6027         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6028 }
6029
6030 const char* typeToStr(int type)
6031 {
6032     static const char* tab[]=
6033     {
6034         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6035         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6036         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6037         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6038         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6039         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6040         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6041         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6042     };
6043     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6044     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6045 }
6046
6047 const char* memopTypeToStr(int type)
6048 {
6049     static const char* tab[] =
6050     {
6051         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6052         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6053         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6054         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6055         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6056         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6057         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6058         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6059     };
6060     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6061     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6062 }
6063
6064 const char* vecopTypeToStr(int type)
6065 {
6066     static const char* tab[] =
6067     {
6068         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6069         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6070         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6071         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6072         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6073         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6074         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6075         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
6076     };
6077     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6078     return cn > 16 ? "?" : tab[depth*16 + cn-1];
6079 }
6080
6081 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6082 {
6083     if( sdepth == ddepth )
6084         return "noconvert";
6085     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6086     if( ddepth >= CV_32F ||
6087         (ddepth == CV_32S && sdepth < CV_32S) ||
6088         (ddepth == CV_16S && sdepth <= CV_8S) ||
6089         (ddepth == CV_16U && sdepth == CV_8U))
6090     {
6091         sprintf(buf, "convert_%s", typestr);
6092     }
6093     else if( sdepth >= CV_32F )
6094         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6095     else
6096         sprintf(buf, "convert_%s_sat", typestr);
6097
6098     return buf;
6099 }
6100
6101 const char* getOpenCLErrorString(int errorCode)
6102 {
6103 #define CV_OCL_CODE(id) case id: return #id
6104 #define CV_OCL_CODE_(id, name) case id: return #name
6105     switch (errorCode)
6106     {
6107     CV_OCL_CODE(CL_SUCCESS);
6108     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6109     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6110     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6111     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6112     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6113     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6114     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6115     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6116     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6117     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6118     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6119     CV_OCL_CODE(CL_MAP_FAILURE);
6120     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6121     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6122     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6123     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6124     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6125     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6126     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6127     CV_OCL_CODE(CL_INVALID_VALUE);
6128     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6129     CV_OCL_CODE(CL_INVALID_PLATFORM);
6130     CV_OCL_CODE(CL_INVALID_DEVICE);
6131     CV_OCL_CODE(CL_INVALID_CONTEXT);
6132     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6133     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6134     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6135     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6136     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6137     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6138     CV_OCL_CODE(CL_INVALID_SAMPLER);
6139     CV_OCL_CODE(CL_INVALID_BINARY);
6140     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6141     CV_OCL_CODE(CL_INVALID_PROGRAM);
6142     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6143     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6144     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6145     CV_OCL_CODE(CL_INVALID_KERNEL);
6146     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6147     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6148     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6149     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6150     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6151     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6152     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6153     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6154     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6155     CV_OCL_CODE(CL_INVALID_EVENT);
6156     CV_OCL_CODE(CL_INVALID_OPERATION);
6157     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6158     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6159     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6160     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6161     // OpenCL 1.1
6162     CV_OCL_CODE(CL_INVALID_PROPERTY);
6163     // OpenCL 1.2
6164     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6165     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6166     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6167     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6168     // OpenCL 2.0
6169     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6170     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6171     // Extensions
6172     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6173     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6174     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6175     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6176     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6177     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6178     default: return "Unknown OpenCL error";
6179     }
6180 #undef CV_OCL_CODE
6181 #undef CV_OCL_CODE_
6182 }
6183
6184 template <typename T>
6185 static std::string kerToStr(const Mat & k)
6186 {
6187     int width = k.cols - 1, depth = k.depth();
6188     const T * const data = k.ptr<T>();
6189
6190     std::ostringstream stream;
6191     stream.precision(10);
6192
6193     if (depth <= CV_8S)
6194     {
6195         for (int i = 0; i < width; ++i)
6196             stream << "DIG(" << (int)data[i] << ")";
6197         stream << "DIG(" << (int)data[width] << ")";
6198     }
6199     else if (depth == CV_32F)
6200     {
6201         stream.setf(std::ios_base::showpoint);
6202         for (int i = 0; i < width; ++i)
6203             stream << "DIG(" << data[i] << "f)";
6204         stream << "DIG(" << data[width] << "f)";
6205     }
6206     else
6207     {
6208         for (int i = 0; i < width; ++i)
6209             stream << "DIG(" << data[i] << ")";
6210         stream << "DIG(" << data[width] << ")";
6211     }
6212
6213     return stream.str();
6214 }
6215
6216 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6217 {
6218     Mat kernel = _kernel.getMat().reshape(1, 1);
6219
6220     int depth = kernel.depth();
6221     if (ddepth < 0)
6222         ddepth = depth;
6223
6224     if (ddepth != depth)
6225         kernel.convertTo(kernel, ddepth);
6226
6227     typedef std::string (* func_t)(const Mat &);
6228     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6229                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6230     const func_t func = funcs[ddepth];
6231     CV_Assert(func != 0);
6232
6233     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6234 }
6235
6236 #define PROCESS_SRC(src) \
6237     do \
6238     { \
6239         if (!src.empty()) \
6240         { \
6241             CV_Assert(src.isMat() || src.isUMat()); \
6242             Size csize = src.size(); \
6243             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6244                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6245             if (cwidth < ckercn || ckercn <= 0) \
6246                 return 1; \
6247             cols.push_back(cwidth); \
6248             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6249                 return 1; \
6250             offsets.push_back(src.offset()); \
6251             steps.push_back(src.step()); \
6252             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6253             kercns.push_back(ckercn); \
6254         } \
6255     } \
6256     while ((void)0, 0)
6257
6258 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6259                               InputArray src4, InputArray src5, InputArray src6,
6260                               InputArray src7, InputArray src8, InputArray src9,
6261                               OclVectorStrategy strat)
6262 {
6263     const ocl::Device & d = ocl::Device::getDefault();
6264
6265     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6266         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6267         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6268         d.preferredVectorWidthDouble(), -1 };
6269
6270     // if the device says don't use vectors
6271     if (vectorWidths[0] == 1)
6272     {
6273         // it's heuristic
6274         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6275         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6276         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6277     }
6278
6279     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6280 }
6281
6282 int checkOptimalVectorWidth(const int *vectorWidths,
6283                             InputArray src1, InputArray src2, InputArray src3,
6284                             InputArray src4, InputArray src5, InputArray src6,
6285                             InputArray src7, InputArray src8, InputArray src9,
6286                             OclVectorStrategy strat)
6287 {
6288     CV_Assert(vectorWidths);
6289
6290     int ref_type = src1.type();
6291
6292     std::vector<size_t> offsets, steps, cols;
6293     std::vector<int> dividers, kercns;
6294     PROCESS_SRC(src1);
6295     PROCESS_SRC(src2);
6296     PROCESS_SRC(src3);
6297     PROCESS_SRC(src4);
6298     PROCESS_SRC(src5);
6299     PROCESS_SRC(src6);
6300     PROCESS_SRC(src7);
6301     PROCESS_SRC(src8);
6302     PROCESS_SRC(src9);
6303
6304     size_t size = offsets.size();
6305
6306     for (size_t i = 0; i < size; ++i)
6307         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6308             dividers[i] >>= 1, kercns[i] >>= 1;
6309
6310     // default strategy
6311     int kercn = *std::min_element(kercns.begin(), kercns.end());
6312
6313     return kercn;
6314 }
6315
6316 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
6317                                  InputArray src4, InputArray src5, InputArray src6,
6318                                  InputArray src7, InputArray src8, InputArray src9)
6319 {
6320     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
6321 }
6322
6323 #undef PROCESS_SRC
6324
6325
6326 // TODO Make this as a method of OpenCL "BuildOptions" class
6327 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
6328 {
6329     if (!buildOptions.empty())
6330         buildOptions += " ";
6331     int type = _m.type(), depth = CV_MAT_DEPTH(type);
6332     buildOptions += format(
6333             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
6334             name.c_str(), ocl::typeToStr(type),
6335             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
6336             name.c_str(), (int)CV_MAT_CN(type),
6337             name.c_str(), (int)CV_ELEM_SIZE(type),
6338             name.c_str(), (int)CV_ELEM_SIZE1(type),
6339             name.c_str(), (int)depth
6340             );
6341 }
6342
6343
6344 struct Image2D::Impl
6345 {
6346     Impl(const UMat &src, bool norm, bool alias)
6347     {
6348         handle = 0;
6349         refcount = 1;
6350         init(src, norm, alias);
6351     }
6352
6353     ~Impl()
6354     {
6355         if (handle)
6356             clReleaseMemObject(handle);
6357     }
6358
6359     static cl_image_format getImageFormat(int depth, int cn, bool norm)
6360     {
6361         cl_image_format format;
6362         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
6363                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
6364         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
6365                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
6366         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
6367
6368         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
6369         int channelOrder = channelOrders[cn];
6370         format.image_channel_data_type = (cl_channel_type)channelType;
6371         format.image_channel_order = (cl_channel_order)channelOrder;
6372         return format;
6373     }
6374
6375     static bool isFormatSupported(cl_image_format format)
6376     {
6377         if (!haveOpenCL())
6378             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6379
6380         cl_context context = (cl_context)Context::getDefault().ptr();
6381         if (!context)
6382             return false;
6383
6384         // Figure out how many formats are supported by this context.
6385         cl_uint numFormats = 0;
6386         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6387                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
6388                                                 NULL, &numFormats);
6389         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
6390         if (numFormats > 0)
6391         {
6392             AutoBuffer<cl_image_format> formats(numFormats);
6393             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
6394                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
6395                                              formats.data(), NULL);
6396             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
6397             for (cl_uint i = 0; i < numFormats; ++i)
6398             {
6399                 if (!memcmp(&formats[i], &format, sizeof(format)))
6400                 {
6401                     return true;
6402                 }
6403             }
6404         }
6405         return false;
6406     }
6407
6408     void init(const UMat &src, bool norm, bool alias)
6409     {
6410         if (!haveOpenCL())
6411             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
6412
6413         CV_Assert(!src.empty());
6414         CV_Assert(ocl::Device::getDefault().imageSupport());
6415
6416         int err, depth = src.depth(), cn = src.channels();
6417         CV_Assert(cn <= 4);
6418         cl_image_format format = getImageFormat(depth, cn, norm);
6419
6420         if (!isFormatSupported(format))
6421             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
6422
6423         if (alias && !src.handle(ACCESS_RW))
6424             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
6425
6426         cl_context context = (cl_context)Context::getDefault().ptr();
6427         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
6428
6429 #ifdef CL_VERSION_1_2
6430         // this enables backwards portability to
6431         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
6432         const Device & d = ocl::Device::getDefault();
6433         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
6434         CV_Assert(!alias || canCreateAlias(src));
6435         if (1 < major || (1 == major && 2 <= minor))
6436         {
6437             cl_image_desc desc;
6438             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
6439             desc.image_width      = src.cols;
6440             desc.image_height     = src.rows;
6441             desc.image_depth      = 0;
6442             desc.image_array_size = 1;
6443             desc.image_row_pitch  = alias ? src.step[0] : 0;
6444             desc.image_slice_pitch = 0;
6445             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
6446             desc.num_mip_levels   = 0;
6447             desc.num_samples      = 0;
6448             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
6449         }
6450         else
6451 #endif
6452         {
6453             CV_SUPPRESS_DEPRECATED_START
6454             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
6455             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
6456             CV_SUPPRESS_DEPRECATED_END
6457         }
6458         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
6459
6460         size_t origin[] = { 0, 0, 0 };
6461         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
6462
6463         cl_mem devData;
6464         if (!alias && !src.isContinuous())
6465         {
6466             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
6467             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
6468                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
6469                 ).c_str());
6470
6471             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
6472             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
6473                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
6474             CV_OCL_DBG_CHECK(clFlush(queue));
6475         }
6476         else
6477         {
6478             devData = (cl_mem)src.handle(ACCESS_READ);
6479         }
6480         CV_Assert(devData != NULL);
6481
6482         if (!alias)
6483         {
6484             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
6485             if (!src.isContinuous())
6486             {
6487                 CV_OCL_DBG_CHECK(clFlush(queue));
6488                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
6489             }
6490         }
6491     }
6492
6493     IMPLEMENT_REFCOUNTABLE();
6494
6495     cl_mem handle;
6496 };
6497
6498 Image2D::Image2D()
6499 {
6500     p = NULL;
6501 }
6502
6503 Image2D::Image2D(const UMat &src, bool norm, bool alias)
6504 {
6505     p = new Impl(src, norm, alias);
6506 }
6507
6508 bool Image2D::canCreateAlias(const UMat &m)
6509 {
6510     bool ret = false;
6511     const Device & d = ocl::Device::getDefault();
6512     if (d.imageFromBufferSupport() && !m.empty())
6513     {
6514         // This is the required pitch alignment in pixels
6515         uint pitchAlign = d.imagePitchAlignment();
6516         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
6517         {
6518             // We don't currently handle the case where the buffer was created
6519             // with CL_MEM_USE_HOST_PTR
6520             if (!m.u->tempUMat())
6521             {
6522                 ret = true;
6523             }
6524         }
6525     }
6526     return ret;
6527 }
6528
6529 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
6530 {
6531     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
6532
6533     return Impl::isFormatSupported(format);
6534 }
6535
6536 Image2D::Image2D(const Image2D & i)
6537 {
6538     p = i.p;
6539     if (p)
6540         p->addref();
6541 }
6542
6543 Image2D & Image2D::operator = (const Image2D & i)
6544 {
6545     if (i.p != p)
6546     {
6547         if (i.p)
6548             i.p->addref();
6549         if (p)
6550             p->release();
6551         p = i.p;
6552     }
6553     return *this;
6554 }
6555
6556 Image2D::~Image2D()
6557 {
6558     if (p)
6559         p->release();
6560 }
6561
6562 void* Image2D::ptr() const
6563 {
6564     return p ? p->handle : 0;
6565 }
6566
6567 bool internal::isOpenCLForced()
6568 {
6569     static bool initialized = false;
6570     static bool value = false;
6571     if (!initialized)
6572     {
6573         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
6574         initialized = true;
6575     }
6576     return value;
6577 }
6578
6579 bool internal::isPerformanceCheckBypassed()
6580 {
6581     static bool initialized = false;
6582     static bool value = false;
6583     if (!initialized)
6584     {
6585         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
6586         initialized = true;
6587     }
6588     return value;
6589 }
6590
6591 bool internal::isCLBuffer(UMat& u)
6592 {
6593     void* h = u.handle(ACCESS_RW);
6594     if (!h)
6595         return true;
6596     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
6597 #if 1
6598     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
6599         return false;
6600 #else
6601     cl_mem_object_type type = 0;
6602     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
6603     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
6604         return false;
6605 #endif
6606     return true;
6607 }
6608
6609 struct Timer::Impl
6610 {
6611     const Queue queue;
6612
6613     Impl(const Queue& q)
6614         : queue(q)
6615     {
6616     }
6617
6618     ~Impl(){}
6619
6620     void start()
6621     {
6622         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6623         timer.start();
6624     }
6625
6626     void stop()
6627     {
6628         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
6629         timer.stop();
6630     }
6631
6632     uint64 durationNS() const
6633     {
6634         return (uint64)(timer.getTimeSec() * 1e9);
6635     }
6636
6637     TickMeter timer;
6638 };
6639
6640 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
6641 Timer::~Timer() { delete p; }
6642
6643 void Timer::start()
6644 {
6645     CV_Assert(p);
6646     p->start();
6647 }
6648
6649 void Timer::stop()
6650 {
6651     CV_Assert(p);
6652     p->stop();
6653 }
6654
6655 uint64 Timer::durationNS() const
6656 {
6657     CV_Assert(p);
6658     return p->durationNS();
6659 }
6660
6661 }} // namespace
6662
6663 #endif // HAVE_OPENCL