Merge remote-tracking branch 'upstream/3.4' into merge-3.4
[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43
44 #ifndef HAVE_OPENCL
45 #include "ocl_disabled.impl.hpp"
46 #else // HAVE_OPENCL
47
48 #include <list>
49 #include <map>
50 #include <deque>
51 #include <set>
52 #include <string>
53 #include <sstream>
54 #include <iostream> // std::cerr
55 #include <fstream>
56 #if !(defined _MSC_VER) || (defined _MSC_VER && _MSC_VER > 1700)
57 #include <inttypes.h>
58 #endif
59
60 #include <opencv2/core/utils/configuration.private.hpp>
61
62 #include <opencv2/core/utils/logger.defines.hpp>
63 #undef CV_LOG_STRIP_LEVEL
64 #define CV_LOG_STRIP_LEVEL CV_LOG_LEVEL_DEBUG + 1
65 #include <opencv2/core/utils/logger.hpp>
66
67 #include "opencv2/core/ocl_genbase.hpp"
68 #include "opencl_kernels_core.hpp"
69
70 #include "opencv2/core/utils/lock.private.hpp"
71 #include "opencv2/core/utils/filesystem.hpp"
72 #include "opencv2/core/utils/filesystem.private.hpp"
73
74 #define CV__ALLOCATOR_STATS_LOG(...) CV_LOG_VERBOSE(NULL, 0, "OpenCL allocator: " << __VA_ARGS__)
75 #include "opencv2/core/utils/allocator_stats.impl.hpp"
76 #undef CV__ALLOCATOR_STATS_LOG
77
78 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG          0
79
80 #define CV_OPENCL_SHOW_RUN_KERNELS               0
81 #define CV_OPENCL_TRACE_CHECK                    0
82
83 #define CV_OPENCL_VALIDATE_BINARY_PROGRAMS       1
84
85 #define CV_OPENCL_SHOW_SVM_ERROR_LOG             1
86 #define CV_OPENCL_SHOW_SVM_LOG                   0
87
88 #include "opencv2/core/bufferpool.hpp"
89 #ifndef LOG_BUFFER_POOL
90 # if 0
91 #   define LOG_BUFFER_POOL printf
92 # else
93 #   define LOG_BUFFER_POOL(...)
94 # endif
95 #endif
96
97 #if CV_OPENCL_SHOW_SVM_LOG
98 // TODO add timestamp logging
99 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
100 #else
101 #define CV_OPENCL_SVM_TRACE_P(...)
102 #endif
103
104 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
105 // TODO add timestamp logging
106 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
107 #else
108 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
109 #endif
110
111 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
112 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
113
114 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
115
116 #ifdef HAVE_DIRECTX
117 #include "directx.hpp"
118 #endif
119
120 #ifdef HAVE_OPENCL_SVM
121 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
122 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
123 #include "opencv2/core/opencl/opencl_svm.hpp"
124 #endif
125
126 #include "umatrix.hpp"
127
128 namespace cv { namespace ocl {
129
130 #define IMPLEMENT_REFCOUNTABLE() \
131     void addref() { CV_XADD(&refcount, 1); } \
132     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
133     int refcount
134
135 static cv::utils::AllocatorStatistics opencl_allocator_stats;
136
137 CV_EXPORTS cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics();
138 cv::utils::AllocatorStatisticsInterface& getOpenCLAllocatorStatistics()
139 {
140     return opencl_allocator_stats;
141 }
142
143 #ifndef _DEBUG
144 static bool isRaiseError()
145 {
146     static bool initialized = false;
147     static bool value = false;
148     if (!initialized)
149     {
150         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR", false);
151         initialized = true;
152     }
153     return value;
154 }
155 #endif
156
157 #if CV_OPENCL_TRACE_CHECK
158 static inline
159 void traceOpenCLCheck(cl_int status, const char* message)
160 {
161     std::cout << "OpenCV(OpenCL:" << status << "): " << message << std::endl << std::flush;
162 }
163 #define CV_OCL_TRACE_CHECK_RESULT(status, message) traceOpenCLCheck(status, message)
164 #else
165 #define CV_OCL_TRACE_CHECK_RESULT(status, message) /* nothing */
166 #endif
167
168 #define CV_OCL_API_ERROR_MSG(check_result, msg) \
169     cv::format("OpenCL error %s (%d) during call: %s", getOpenCLErrorString(check_result), check_result, msg)
170
171 #define CV_OCL_CHECK_RESULT(check_result, msg) \
172     do { \
173         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
174         if (check_result != CL_SUCCESS) \
175         { \
176             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
177             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
178             CV_Error(Error::OpenCLApiCallError, error_msg); \
179         } \
180     } while (0)
181
182 #define CV_OCL_CHECK_(expr, check_result) do { expr; CV_OCL_CHECK_RESULT(check_result, #expr); } while (0)
183
184 #define CV_OCL_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_CHECK_RESULT(__cl_result, #expr); } while (0)
185
186 #ifdef _DEBUG
187 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) CV_OCL_CHECK_RESULT(check_result, msg)
188 #define CV_OCL_DBG_CHECK(expr) CV_OCL_CHECK(expr)
189 #define CV_OCL_DBG_CHECK_(expr, check_result) CV_OCL_CHECK_(expr, check_result)
190 #else
191 #define CV_OCL_DBG_CHECK_RESULT(check_result, msg) \
192     do { \
193         CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
194         if (check_result != CL_SUCCESS && isRaiseError()) \
195         { \
196             static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
197             cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
198             CV_Error(Error::OpenCLApiCallError, error_msg); \
199         } \
200     } while (0)
201 #define CV_OCL_DBG_CHECK_(expr, check_result) do { expr; CV_OCL_DBG_CHECK_RESULT(check_result, #expr); } while (0)
202 #define CV_OCL_DBG_CHECK(expr) do { cl_int __cl_result = (expr); CV_OCL_DBG_CHECK_RESULT(__cl_result, #expr); } while (0)
203 #endif
204
205
206 static const bool CV_OPENCL_CACHE_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_ENABLE", true);
207 static const bool CV_OPENCL_CACHE_WRITE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_WRITE", true);
208 static const bool CV_OPENCL_CACHE_LOCK_ENABLE = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_LOCK_ENABLE", true);
209 static const bool CV_OPENCL_CACHE_CLEANUP = utils::getConfigurationParameterBool("OPENCV_OPENCL_CACHE_CLEANUP", true);
210
211 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
212 static const bool CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE = utils::getConfigurationParameterBool("OPENCV_OPENCL_VALIDATE_BINARY_PROGRAMS", false);
213 #endif
214
215 // Option to disable calls clEnqueueReadBufferRect / clEnqueueWriteBufferRect / clEnqueueCopyBufferRect
216 static const bool CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS = utils::getConfigurationParameterBool("OPENCV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS",
217 #ifdef __APPLE__
218         true
219 #else
220         false
221 #endif
222 );
223
224 static const String getBuildExtraOptions()
225 {
226     static String param_buildExtraOptions;
227     static bool initialized = false;
228     if (!initialized)
229     {
230         param_buildExtraOptions = utils::getConfigurationParameterString("OPENCV_OPENCL_BUILD_EXTRA_OPTIONS", "");
231         initialized = true;
232         if (!param_buildExtraOptions.empty())
233             CV_LOG_WARNING(NULL, "OpenCL: using extra build options: '" << param_buildExtraOptions << "'");
234     }
235     return param_buildExtraOptions;
236 }
237
238 static const bool CV_OPENCL_ENABLE_MEM_USE_HOST_PTR = utils::getConfigurationParameterBool("OPENCV_OPENCL_ENABLE_MEM_USE_HOST_PTR", true);
239 static const size_t CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR", 4);
240
241
242 struct UMat2D
243 {
244     UMat2D(const UMat& m)
245     {
246         offset = (int)m.offset;
247         step = (int)m.step;
248         rows = m.rows;
249         cols = m.cols;
250     }
251     int offset;
252     int step;
253     int rows;
254     int cols;
255 };
256
257 struct UMat3D
258 {
259     UMat3D(const UMat& m)
260     {
261         offset = (int)m.offset;
262         step = (int)m.step.p[1];
263         slicestep = (int)m.step.p[0];
264         slices = (int)m.size.p[0];
265         rows = m.size.p[1];
266         cols = m.size.p[2];
267     }
268     int offset;
269     int slicestep;
270     int step;
271     int slices;
272     int rows;
273     int cols;
274 };
275
276 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
277 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
278 {
279     static uint64 table[256];
280     static bool initialized = false;
281
282     if( !initialized )
283     {
284         for( int i = 0; i < 256; i++ )
285         {
286             uint64 c = i;
287             for( int j = 0; j < 8; j++ )
288                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
289             table[i] = c;
290         }
291         initialized = true;
292     }
293
294     uint64 crc = ~crc0;
295     for( size_t idx = 0; idx < size; idx++ )
296         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
297
298     return ~crc;
299 }
300
301 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
302 struct OpenCLBinaryCacheConfigurator
303 {
304     cv::String cache_path_;
305     cv::String cache_lock_filename_;
306     cv::Ptr<utils::fs::FileLock> cache_lock_;
307
308     typedef std::map<std::string, std::string> ContextCacheType;
309     ContextCacheType prepared_contexts_;
310     Mutex mutex_prepared_contexts_;
311
312     OpenCLBinaryCacheConfigurator()
313     {
314         CV_LOG_DEBUG(NULL, "Initializing OpenCL cache configuration...");
315         if (!CV_OPENCL_CACHE_ENABLE)
316         {
317             CV_LOG_INFO(NULL, "OpenCL cache is disabled");
318             return;
319         }
320         cache_path_ = utils::fs::getCacheDirectory("opencl_cache", "OPENCV_OPENCL_CACHE_DIR");
321         if (cache_path_.empty())
322         {
323             CV_LOG_INFO(NULL, "Specify OPENCV_OPENCL_CACHE_DIR configuration parameter to enable OpenCL cache");
324         }
325         do
326         {
327             try
328             {
329                 if (cache_path_.empty())
330                     break;
331                 if (cache_path_ == "disabled")
332                     break;
333                 if (!utils::fs::createDirectories(cache_path_))
334                 {
335                     CV_LOG_DEBUG(NULL, "Can't use OpenCL cache directory: " << cache_path_);
336                     clear();
337                     break;
338                 }
339
340                 if (CV_OPENCL_CACHE_LOCK_ENABLE)
341                 {
342                     cache_lock_filename_ = cache_path_ + ".lock";
343                     if (!utils::fs::exists(cache_lock_filename_))
344                     {
345                         CV_LOG_DEBUG(NULL, "Creating lock file... (" << cache_lock_filename_ << ")");
346                         std::ofstream lock_filename(cache_lock_filename_.c_str(), std::ios::out);
347                         if (!lock_filename.is_open())
348                         {
349                             CV_LOG_WARNING(NULL, "Can't create lock file for OpenCL program cache: " << cache_lock_filename_);
350                             break;
351                         }
352                     }
353
354                     try
355                     {
356                         cache_lock_ = makePtr<utils::fs::FileLock>(cache_lock_filename_.c_str());
357                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... (" << cache_lock_filename_ << ")");
358                         {
359                             utils::shared_lock_guard<utils::fs::FileLock> lock(*cache_lock_);
360                         }
361                         CV_LOG_VERBOSE(NULL, 0, "Checking cache lock... Done!");
362                     }
363                     catch (const cv::Exception& e)
364                     {
365                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_ << std::endl << e.what());
366                     }
367                     catch (...)
368                     {
369                         CV_LOG_WARNING(NULL, "Can't create OpenCL program cache lock: " << cache_lock_filename_);
370                     }
371                 }
372                 else
373                 {
374                     if (CV_OPENCL_CACHE_WRITE)
375                     {
376                         CV_LOG_WARNING(NULL, "OpenCL cache lock is disabled while cache write is allowed "
377                                 "(not safe for multiprocess environment)");
378                     }
379                     else
380                     {
381                         CV_LOG_INFO(NULL, "OpenCL cache lock is disabled");
382                     }
383                 }
384             }
385             catch (const cv::Exception& e)
386             {
387                 CV_LOG_WARNING(NULL, "Can't prepare OpenCL program cache: " << cache_path_ << std::endl << e.what());
388                 clear();
389             }
390         } while (0);
391         if (!cache_path_.empty())
392         {
393             if (cache_lock_.empty() && CV_OPENCL_CACHE_LOCK_ENABLE)
394             {
395                 CV_LOG_WARNING(NULL, "Initialized OpenCL cache directory, but interprocess synchronization lock is not available. "
396                         "Consider to disable OpenCL cache: OPENCV_OPENCL_CACHE_DIR=disabled");
397             }
398             else
399             {
400                 CV_LOG_INFO(NULL, "Successfully initialized OpenCL cache directory: " << cache_path_);
401             }
402         }
403     }
404
405     void clear()
406     {
407         cache_path_.clear();
408         cache_lock_filename_.clear();
409         cache_lock_.release();
410     }
411
412     std::string prepareCacheDirectoryForContext(const std::string& ctx_prefix,
413             const std::string& cleanup_prefix)
414     {
415         if (cache_path_.empty())
416             return std::string();
417
418         AutoLock lock(mutex_prepared_contexts_);
419
420         ContextCacheType::iterator found_it = prepared_contexts_.find(ctx_prefix);
421         if (found_it != prepared_contexts_.end())
422             return found_it->second;
423
424         CV_LOG_INFO(NULL, "Preparing OpenCL cache configuration for context: " << ctx_prefix);
425
426         std::string target_directory = cache_path_ + ctx_prefix + "/";
427         bool result = utils::fs::isDirectory(target_directory);
428         if (!result)
429         {
430             try
431             {
432                 CV_LOG_VERBOSE(NULL, 0, "Creating directory: " << target_directory);
433                 if (utils::fs::createDirectories(target_directory))
434                 {
435                     result = true;
436                 }
437                 else
438                 {
439                     CV_LOG_WARNING(NULL, "Can't create directory: " << target_directory);
440                 }
441             }
442             catch (const cv::Exception& e)
443             {
444                 CV_LOG_ERROR(NULL, "Can't create OpenCL program cache directory for context: " << target_directory << std::endl << e.what());
445             }
446         }
447         target_directory = result ? target_directory : std::string();
448         prepared_contexts_.insert(std::pair<std::string, std::string>(ctx_prefix, target_directory));
449
450         if (result && CV_OPENCL_CACHE_CLEANUP && CV_OPENCL_CACHE_WRITE && !cleanup_prefix.empty())
451         {
452             try
453             {
454                 std::vector<String> entries;
455                 utils::fs::glob_relative(cache_path_, cleanup_prefix + "*", entries, false, true);
456                 std::vector<String> remove_entries;
457                 for (size_t i = 0; i < entries.size(); i++)
458                 {
459                     const String& name = entries[i];
460                     if (0 == name.find(cleanup_prefix))
461                     {
462                         if (0 == name.find(ctx_prefix))
463                             continue; // skip current
464                         remove_entries.push_back(name);
465                     }
466                 }
467                 if (!remove_entries.empty())
468                 {
469                     CV_LOG_WARNING(NULL, (remove_entries.size() == 1
470                             ? "Detected OpenCL cache directory for other version of OpenCL device."
471                             : "Detected OpenCL cache directories for other versions of OpenCL device.")
472                             << " We assume that these directories are obsolete after OpenCL runtime/drivers upgrade.");
473                     CV_LOG_WARNING(NULL, "Trying to remove these directories...");
474                     for (size_t i = 0; i < remove_entries.size(); i++)
475                     {
476                         CV_LOG_WARNING(NULL, "- " << remove_entries[i]);
477                     }
478                     CV_LOG_WARNING(NULL, "Note: You can disable this behavior via this option: OPENCV_OPENCL_CACHE_CLEANUP=0");
479
480                     for (size_t i = 0; i < remove_entries.size(); i++)
481                     {
482                         const String& name = remove_entries[i];
483                         cv::String path = utils::fs::join(cache_path_, name);
484                         try
485                         {
486                             utils::fs::remove_all(path);
487                             CV_LOG_WARNING(NULL, "Removed: " << path);
488                         }
489                         catch (const cv::Exception& e)
490                         {
491                             CV_LOG_ERROR(NULL, "Exception during removal of obsolete OpenCL cache directory: " << path << std::endl << e.what());
492                         }
493                     }
494                 }
495             }
496             catch (...)
497             {
498                 CV_LOG_WARNING(NULL, "Can't check for obsolete OpenCL cache directories");
499             }
500         }
501
502         CV_LOG_VERBOSE(NULL, 1, "  Result: " << (target_directory.empty() ? std::string("Failed") : target_directory));
503         return target_directory;
504     }
505
506     static OpenCLBinaryCacheConfigurator& getSingletonInstance()
507     {
508         CV_SINGLETON_LAZY_INIT_REF(OpenCLBinaryCacheConfigurator, new OpenCLBinaryCacheConfigurator());
509     }
510 };
511 class BinaryProgramFile
512 {
513     enum { MAX_ENTRIES = 64 };
514
515     typedef unsigned int uint32_t;
516
517     struct CV_DECL_ALIGNED(4) FileHeader
518     {
519         uint32_t sourceSignatureSize;
520         //char sourceSignature[];
521     };
522
523     struct CV_DECL_ALIGNED(4) FileTable
524     {
525         uint32_t numberOfEntries;
526         //uint32_t firstEntryOffset[];
527     };
528
529     struct CV_DECL_ALIGNED(4) FileEntry
530     {
531         uint32_t nextEntryFileOffset; // 0 for the last entry in chain
532         uint32_t keySize;
533         uint32_t dataSize;
534         //char key[];
535         //char data[];
536     };
537
538     const std::string fileName_;
539     const char* const sourceSignature_;
540     const size_t sourceSignatureSize_;
541
542     std::fstream f;
543
544     uint32_t entryOffsets[MAX_ENTRIES];
545
546     uint32_t getHash(const std::string& options)
547     {
548         uint64 hash = crc64((const uchar*)options.c_str(), options.size(), 0);
549         return hash & (MAX_ENTRIES - 1);
550     }
551
552     inline size_t getFileSize()
553     {
554         size_t pos = (size_t)f.tellg();
555         f.seekg(0, std::fstream::end);
556         size_t fileSize = (size_t)f.tellg();
557         f.seekg(pos, std::fstream::beg);
558         return fileSize;
559     }
560     inline uint32_t readUInt32()
561     {
562         uint32_t res = 0;
563         f.read((char*)&res, sizeof(uint32_t));
564         CV_Assert(!f.fail());
565         return res;
566     }
567     inline void writeUInt32(const uint32_t value)
568     {
569         uint32_t v = value;
570         f.write((char*)&v, sizeof(uint32_t));
571         CV_Assert(!f.fail());
572     }
573
574     inline void seekReadAbsolute(size_t pos)
575     {
576         f.seekg(pos, std::fstream::beg);
577         CV_Assert(!f.fail());
578     }
579     inline void seekReadRelative(size_t pos)
580     {
581         f.seekg(pos, std::fstream::cur);
582         CV_Assert(!f.fail());
583     }
584
585     inline void seekWriteAbsolute(size_t pos)
586     {
587         f.seekp(pos, std::fstream::beg);
588         CV_Assert(!f.fail());
589     }
590
591     void clearFile()
592     {
593         f.close();
594         if (0 != remove(fileName_.c_str()))
595             CV_LOG_ERROR(NULL, "Can't remove: " << fileName_);
596         return;
597     }
598
599 public:
600     BinaryProgramFile(const std::string& fileName, const char* sourceSignature)
601         : fileName_(fileName), sourceSignature_(sourceSignature), sourceSignatureSize_(sourceSignature_ ? strlen(sourceSignature_) : 0)
602     {
603         CV_StaticAssert(sizeof(uint32_t) == 4, "");
604         CV_Assert(sourceSignature_ != NULL);
605         CV_Assert(sourceSignatureSize_ > 0);
606         memset(entryOffsets, 0, sizeof(entryOffsets));
607
608         f.rdbuf()->pubsetbuf(0, 0); // disable buffering
609         f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
610         if(f.is_open() && getFileSize() > 0)
611         {
612             bool isValid = false;
613             try
614             {
615                 uint32_t fileSourceSignatureSize = readUInt32();
616                 if (fileSourceSignatureSize == sourceSignatureSize_)
617                 {
618                     cv::AutoBuffer<char> fileSourceSignature(fileSourceSignatureSize + 1);
619                     f.read(fileSourceSignature.data(), fileSourceSignatureSize);
620                     if (f.eof())
621                     {
622                         CV_LOG_ERROR(NULL, "Unexpected EOF");
623                     }
624                     else if (memcmp(sourceSignature, fileSourceSignature.data(), fileSourceSignatureSize) == 0)
625                     {
626                         isValid = true;
627                     }
628                 }
629                 if (!isValid)
630                 {
631                     CV_LOG_ERROR(NULL, "Source code signature/hash mismatch (program source code has been changed/updated)");
632                 }
633             }
634             catch (const cv::Exception& e)
635             {
636                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : " << e.what());
637             }
638             catch (...)
639             {
640                 CV_LOG_ERROR(NULL, "Can't open binary program file: " << fileName << " : Unknown error");
641             }
642             if (!isValid)
643             {
644                 clearFile();
645             }
646             else
647             {
648                 seekReadAbsolute(0);
649             }
650         }
651     }
652
653     bool read(const std::string& key, std::vector<char>& buf)
654     {
655         if (!f.is_open())
656             return false;
657
658         size_t fileSize = getFileSize();
659         if (fileSize == 0)
660         {
661             CV_LOG_ERROR(NULL, "Invalid file (empty): " << fileName_);
662             clearFile();
663             return false;
664         }
665         seekReadAbsolute(0);
666
667         // bypass FileHeader
668         uint32_t fileSourceSignatureSize = readUInt32();
669         CV_Assert(fileSourceSignatureSize > 0);
670         seekReadRelative(fileSourceSignatureSize);
671
672         uint32_t numberOfEntries = readUInt32();
673         CV_Assert(numberOfEntries > 0);
674         if (numberOfEntries != MAX_ENTRIES)
675         {
676             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
677             clearFile();
678             return false;
679         }
680         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
681         CV_Assert(!f.fail());
682
683         uint32_t entryNum = getHash(key);
684
685         uint32_t entryOffset = entryOffsets[entryNum];
686         FileEntry entry;
687         while (entryOffset > 0)
688         {
689             seekReadAbsolute(entryOffset);
690             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
691             f.read((char*)&entry, sizeof(entry));
692             CV_Assert(!f.fail());
693             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
694             if (key.size() == entry.keySize)
695             {
696                 if (entry.keySize > 0)
697                 {
698                     f.read(fileKey.data(), entry.keySize);
699                     CV_Assert(!f.fail());
700                 }
701                 if (memcmp(fileKey.data(), key.c_str(), entry.keySize) == 0)
702                 {
703                     buf.resize(entry.dataSize);
704                     f.read(&buf[0], entry.dataSize);
705                     CV_Assert(!f.fail());
706                     seekReadAbsolute(0);
707                     CV_LOG_VERBOSE(NULL, 0, "Read...");
708                     return true;
709                 }
710             }
711             if (entry.nextEntryFileOffset == 0)
712                 break;
713             entryOffset = entry.nextEntryFileOffset;
714         }
715         return false;
716     }
717
718     bool write(const std::string& key, std::vector<char>& buf)
719     {
720         if (!f.is_open())
721         {
722             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
723             if (!f.is_open())
724             {
725                 f.open(fileName_.c_str(), std::ios::out|std::ios::binary);
726                 if (!f.is_open())
727                 {
728                     CV_LOG_ERROR(NULL, "Can't create file: " << fileName_);
729                     return false;
730                 }
731             }
732         }
733
734         size_t fileSize = getFileSize();
735         if (fileSize == 0)
736         {
737             // Write header
738             seekWriteAbsolute(0);
739             writeUInt32((uint32_t)sourceSignatureSize_);
740             f.write(sourceSignature_, sourceSignatureSize_);
741             CV_Assert(!f.fail());
742
743             writeUInt32(MAX_ENTRIES);
744             memset(entryOffsets, 0, sizeof(entryOffsets));
745             f.write((char*)entryOffsets, sizeof(entryOffsets));
746             CV_Assert(!f.fail());
747             f.flush();
748             CV_Assert(!f.fail());
749             f.close();
750             f.open(fileName_.c_str(), std::ios::in|std::ios::out|std::ios::binary);
751             CV_Assert(f.is_open());
752             fileSize = getFileSize();
753         }
754         seekReadAbsolute(0);
755
756         // bypass FileHeader
757         uint32_t fileSourceSignatureSize = readUInt32();
758         CV_Assert(fileSourceSignatureSize == sourceSignatureSize_);
759         seekReadRelative(fileSourceSignatureSize);
760
761         uint32_t numberOfEntries = readUInt32();
762         CV_Assert(numberOfEntries > 0);
763         if (numberOfEntries != MAX_ENTRIES)
764         {
765             CV_LOG_ERROR(NULL, "Invalid file: " << fileName_);
766             clearFile();
767             return false;
768         }
769         size_t tableEntriesOffset = (size_t)f.tellg();
770         f.read((char*)&entryOffsets[0], sizeof(entryOffsets));
771         CV_Assert(!f.fail());
772
773         uint32_t entryNum = getHash(key);
774
775         uint32_t entryOffset = entryOffsets[entryNum];
776         FileEntry entry;
777         while (entryOffset > 0)
778         {
779             seekReadAbsolute(entryOffset);
780             //CV_StaticAssert(sizeof(entry) == sizeof(uint32_t) * 3, "");
781             f.read((char*)&entry, sizeof(entry));
782             CV_Assert(!f.fail());
783             cv::AutoBuffer<char> fileKey(entry.keySize + 1);
784             if (key.size() == entry.keySize)
785             {
786                 if (entry.keySize > 0)
787                 {
788                     f.read(fileKey.data(), entry.keySize);
789                     CV_Assert(!f.fail());
790                 }
791                 if (0 == memcmp(fileKey.data(), key.c_str(), entry.keySize))
792                 {
793                     // duplicate
794                     CV_LOG_VERBOSE(NULL, 0, "Duplicate key ignored: " << fileName_);
795                     return false;
796                 }
797             }
798             if (entry.nextEntryFileOffset == 0)
799                 break;
800             entryOffset = entry.nextEntryFileOffset;
801         }
802         seekReadAbsolute(0);
803         if (entryOffset > 0)
804         {
805             seekWriteAbsolute(entryOffset);
806             entry.nextEntryFileOffset = (uint32_t)fileSize;
807             f.write((char*)&entry, sizeof(entry));
808             CV_Assert(!f.fail());
809         }
810         else
811         {
812             entryOffsets[entryNum] = (uint32_t)fileSize;
813             seekWriteAbsolute(tableEntriesOffset);
814             f.write((char*)entryOffsets, sizeof(entryOffsets));
815             CV_Assert(!f.fail());
816         }
817         seekWriteAbsolute(fileSize);
818         entry.nextEntryFileOffset = 0;
819         entry.dataSize = (uint32_t)buf.size();
820         entry.keySize = (uint32_t)key.size();
821         f.write((char*)&entry, sizeof(entry));
822         CV_Assert(!f.fail());
823         f.write(key.c_str(), entry.keySize);
824         CV_Assert(!f.fail());
825         f.write(&buf[0], entry.dataSize);
826         CV_Assert(!f.fail());
827         f.flush();
828         CV_Assert(!f.fail());
829         CV_LOG_VERBOSE(NULL, 0, "Write... (" << buf.size() << " bytes)");
830         return true;
831     }
832 };
833 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
834
835
836
837 struct OpenCLExecutionContext::Impl
838 {
839     ocl::Context context_;
840     int device_;  // device index in context
841     ocl::Queue queue_;
842     int useOpenCL_;
843
844 protected:
845     Impl() = delete;
846
847     void _init_device(cl_device_id deviceID)
848     {
849         CV_Assert(deviceID);
850         int ndevices = (int)context_.ndevices();
851         CV_Assert(ndevices > 0);
852         bool found = false;
853         for (int i = 0; i < ndevices; i++)
854         {
855             ocl::Device d = context_.device(i);
856             cl_device_id dhandle = (cl_device_id)d.ptr();
857             if (dhandle == deviceID)
858             {
859                 device_ = i;
860                 found = true;
861                 break;
862             }
863         }
864         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
865     }
866
867     void _init_device(const ocl::Device& device)
868     {
869         CV_Assert(device.ptr());
870         int ndevices = (int)context_.ndevices();
871         CV_Assert(ndevices > 0);
872         bool found = false;
873         for (int i = 0; i < ndevices; i++)
874         {
875             ocl::Device d = context_.device(i);
876             if (d.getImpl() == device.getImpl())
877             {
878                 device_ = i;
879                 found = true;
880                 break;
881             }
882         }
883         CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
884     }
885
886 public:
887     Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
888         : device_(0), useOpenCL_(-1)
889     {
890         CV_UNUSED(platformID);
891         CV_Assert(context);
892         CV_Assert(deviceID);
893
894         context_ = Context::fromHandle(context);
895         _init_device(deviceID);
896         queue_ = Queue(context_, context_.device(device_));
897     }
898
899     Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
900         : device_(0), useOpenCL_(-1)
901     {
902         CV_Assert(context.ptr());
903         CV_Assert(device.ptr());
904
905         context_ = context;
906         _init_device(device);
907         queue_ = queue;
908     }
909
910     Impl(const ocl::Context& context, const ocl::Device& device)
911         : device_(0), useOpenCL_(-1)
912     {
913         CV_Assert(context.ptr());
914         CV_Assert(device.ptr());
915
916         context_ = context;
917         _init_device(device);
918         queue_ = Queue(context_, context_.device(device_));
919     }
920
921     Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
922         : context_(context)
923         , device_(device)
924         , queue_(queue)
925         , useOpenCL_(-1)
926     {
927         // nothing
928     }
929     Impl(const Impl& other)
930         : context_(other.context_)
931         , device_(other.device_)
932         , queue_(other.queue_)
933         , useOpenCL_(-1)
934     {
935         // nothing
936     }
937
938     inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
939     bool useOpenCL()
940     {
941         if (useOpenCL_ < 0)
942         {
943             try
944             {
945                 useOpenCL_ = 0;
946                 if (!context_.empty() && context_.ndevices() > 0)
947                 {
948                     const Device& d = context_.device(device_);
949                     useOpenCL_ = d.available();
950                 }
951             }
952             catch (const cv::Exception&)
953             {
954                 // nothing
955             }
956             if (!useOpenCL_)
957                 CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
958         }
959         return useOpenCL_ > 0;
960     }
961
962     void setUseOpenCL(bool flag)
963     {
964         if (!flag)
965             useOpenCL_ = 0;
966         else
967             useOpenCL_ = -1;
968     }
969
970     static const std::shared_ptr<Impl>& getInitializedExecutionContext()
971     {
972         CV_TRACE_FUNCTION();
973
974         CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
975
976         static bool initialized = false;
977         static std::shared_ptr<Impl> g_primaryExecutionContext;
978
979         if (!initialized)
980         {
981             cv::AutoLock lock(getInitializationMutex());
982             if (!initialized)
983             {
984                 CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
985                 try
986                 {
987                     Context c = ocl::Context::create(std::string());
988                     if (c.ndevices())
989                     {
990                         int deviceId = 0;
991                         auto& d = c.device(deviceId);
992                         if (d.available())
993                         {
994                             auto q = ocl::Queue(c, d);
995                             if (!q.ptr())
996                             {
997                                 CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
998                             }
999                             else
1000                             {
1001                                 g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
1002                                 CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
1003                             }
1004                         }
1005                         else
1006                         {
1007                             CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
1008                         }
1009                     }
1010                     else
1011                     {
1012                         CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
1013                     }
1014                 }
1015                 catch (const std::exception& e)
1016                 {
1017                     CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
1018                 }
1019                 catch (...)
1020                 {
1021                     CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
1022                 }
1023                 initialized = true;
1024             }
1025         }
1026         return g_primaryExecutionContext;
1027     }
1028 };
1029
1030 Context& OpenCLExecutionContext::getContext() const
1031 {
1032     CV_Assert(p);
1033     return p->context_;
1034 }
1035 Device& OpenCLExecutionContext::getDevice() const
1036 {
1037     CV_Assert(p);
1038     return p->context_.device(p->device_);
1039 }
1040 Queue& OpenCLExecutionContext::getQueue() const
1041 {
1042     CV_Assert(p);
1043     return p->queue_;
1044 }
1045
1046 bool OpenCLExecutionContext::useOpenCL() const
1047 {
1048     if (p)
1049         return p->useOpenCL();
1050     return false;
1051 }
1052 void OpenCLExecutionContext::setUseOpenCL(bool flag)
1053 {
1054     CV_Assert(p);
1055     p->setUseOpenCL(flag);
1056 }
1057
1058 /* static */
1059 OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
1060 {
1061     CV_TRACE_FUNCTION();
1062     CoreTLSData& data = getCoreTlsData();
1063     OpenCLExecutionContext& c = data.oclExecutionContext;
1064     if (!data.oclExecutionContextInitialized)
1065     {
1066         data.oclExecutionContextInitialized = true;
1067         if (c.empty() && haveOpenCL())
1068             c.p = Impl::getInitializedExecutionContext();
1069     }
1070     return c;
1071 }
1072
1073 /* static */
1074 OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
1075 {
1076     CV_TRACE_FUNCTION();
1077     CoreTLSData& data = getCoreTlsData();
1078     OpenCLExecutionContext& c = data.oclExecutionContext;
1079     return c;
1080 }
1081
1082 void OpenCLExecutionContext::bind() const
1083 {
1084     CV_TRACE_FUNCTION();
1085     CV_Assert(p);
1086     CoreTLSData& data = getCoreTlsData();
1087     data.oclExecutionContext = *this;
1088     data.oclExecutionContextInitialized = true;
1089     data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
1090 }
1091
1092
1093 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
1094 {
1095     CV_TRACE_FUNCTION();
1096     CV_Assert(p);
1097     const Queue q(getContext(), getDevice());
1098     return cloneWithNewQueue(q);
1099 }
1100
1101 OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
1102 {
1103     CV_TRACE_FUNCTION();
1104     CV_Assert(p);
1105     CV_Assert(q.ptr() != NULL);
1106     OpenCLExecutionContext c;
1107     c.p = std::make_shared<Impl>(p->context_, p->device_, q);
1108     return c;
1109 }
1110
1111 /* static */
1112 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
1113 {
1114     CV_TRACE_FUNCTION();
1115     if (!haveOpenCL())
1116         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1117
1118     CV_Assert(!context.empty());
1119     CV_Assert(context.ptr());
1120     CV_Assert(!device.empty());
1121     CV_Assert(device.ptr());
1122     OpenCLExecutionContext ctx;
1123     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
1124     return ctx;
1125
1126 }
1127
1128 /* static */
1129 OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
1130 {
1131     CV_TRACE_FUNCTION();
1132     if (!haveOpenCL())
1133         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
1134
1135     CV_Assert(!context.empty());
1136     CV_Assert(context.ptr());
1137     CV_Assert(!device.empty());
1138     CV_Assert(device.ptr());
1139     OpenCLExecutionContext ctx;
1140     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
1141     return ctx;
1142
1143 }
1144
1145 void OpenCLExecutionContext::release()
1146 {
1147     CV_TRACE_FUNCTION();
1148     p.reset();
1149 }
1150
1151
1152
1153 // true if we have initialized OpenCL subsystem with available platforms
1154 static bool g_isOpenCLInitialized = false;
1155 static bool g_isOpenCLAvailable = false;
1156
1157 bool haveOpenCL()
1158 {
1159     CV_TRACE_FUNCTION();
1160
1161     if (!g_isOpenCLInitialized)
1162     {
1163         CV_TRACE_REGION("Init_OpenCL_Runtime");
1164         const char* envPath = getenv("OPENCV_OPENCL_RUNTIME");
1165         if (envPath)
1166         {
1167             if (cv::String(envPath) == "disabled")
1168             {
1169                 g_isOpenCLAvailable = false;
1170                 g_isOpenCLInitialized = true;
1171                 return false;
1172             }
1173         }
1174
1175         cv::AutoLock lock(getInitializationMutex());
1176         CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
1177         try
1178         {
1179             cl_uint n = 0;
1180             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1181             g_isOpenCLAvailable &= n > 0;
1182             CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
1183         }
1184         catch (...)
1185         {
1186             g_isOpenCLAvailable = false;
1187         }
1188         g_isOpenCLInitialized = true;
1189     }
1190     return g_isOpenCLAvailable;
1191 }
1192
1193 bool useOpenCL()
1194 {
1195     CoreTLSData& data = getCoreTlsData();
1196     if (data.useOpenCL < 0)
1197     {
1198         try
1199         {
1200             data.useOpenCL = 0;
1201             if (haveOpenCL())
1202             {
1203                 auto c = OpenCLExecutionContext::getCurrent();
1204                 data.useOpenCL = c.useOpenCL();
1205             }
1206         }
1207         catch (...)
1208         {
1209             CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
1210         }
1211     }
1212     return data.useOpenCL > 0;
1213 }
1214
1215 bool isOpenCLActivated()
1216 {
1217     if (!g_isOpenCLAvailable)
1218         return false; // prevent unnecessary OpenCL activation via useOpenCL()->haveOpenCL() calls
1219     return useOpenCL();
1220 }
1221
1222 void setUseOpenCL(bool flag)
1223 {
1224     CV_TRACE_FUNCTION();
1225
1226     CoreTLSData& data = getCoreTlsData();
1227     auto& c = OpenCLExecutionContext::getCurrentRef();
1228     if (!c.empty())
1229     {
1230         c.setUseOpenCL(flag);
1231         data.useOpenCL = c.useOpenCL();
1232     }
1233     else
1234     {
1235         if (!flag)
1236             data.useOpenCL = 0;
1237         else
1238             data.useOpenCL = -1; // enabled by default (if context is not initialized)
1239     }
1240 }
1241
1242
1243
1244 #ifdef HAVE_CLAMDBLAS
1245
1246 class AmdBlasHelper
1247 {
1248 public:
1249     static AmdBlasHelper & getInstance()
1250     {
1251         CV_SINGLETON_LAZY_INIT_REF(AmdBlasHelper, new AmdBlasHelper())
1252     }
1253
1254     bool isAvailable() const
1255     {
1256         return g_isAmdBlasAvailable;
1257     }
1258
1259     ~AmdBlasHelper()
1260     {
1261         try
1262         {
1263             clAmdBlasTeardown();
1264         }
1265         catch (...) { }
1266     }
1267
1268 protected:
1269     AmdBlasHelper()
1270     {
1271         if (!g_isAmdBlasInitialized)
1272         {
1273             AutoLock lock(getInitializationMutex());
1274
1275             if (!g_isAmdBlasInitialized)
1276             {
1277                 if (haveOpenCL())
1278                 {
1279                     try
1280                     {
1281                         g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1282                     }
1283                     catch (...)
1284                     {
1285                         g_isAmdBlasAvailable = false;
1286                     }
1287                 }
1288                 else
1289                     g_isAmdBlasAvailable = false;
1290
1291                 g_isAmdBlasInitialized = true;
1292             }
1293         }
1294     }
1295
1296 private:
1297     static bool g_isAmdBlasInitialized;
1298     static bool g_isAmdBlasAvailable;
1299 };
1300
1301 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1302 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1303
1304 bool haveAmdBlas()
1305 {
1306     return AmdBlasHelper::getInstance().isAvailable();
1307 }
1308
1309 #else
1310
1311 bool haveAmdBlas()
1312 {
1313     return false;
1314 }
1315
1316 #endif
1317
1318 #ifdef HAVE_CLAMDFFT
1319
1320 class AmdFftHelper
1321 {
1322 public:
1323     static AmdFftHelper & getInstance()
1324     {
1325         CV_SINGLETON_LAZY_INIT_REF(AmdFftHelper, new AmdFftHelper())
1326     }
1327
1328     bool isAvailable() const
1329     {
1330         return g_isAmdFftAvailable;
1331     }
1332
1333     ~AmdFftHelper()
1334     {
1335         try
1336         {
1337 //            clAmdFftTeardown();
1338         }
1339         catch (...) { }
1340     }
1341
1342 protected:
1343     AmdFftHelper()
1344     {
1345         if (!g_isAmdFftInitialized)
1346         {
1347             AutoLock lock(getInitializationMutex());
1348
1349             if (!g_isAmdFftInitialized)
1350             {
1351                 if (haveOpenCL())
1352                 {
1353                     try
1354                     {
1355                         cl_uint major, minor, patch;
1356                         CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1357
1358                         // it throws exception in case AmdFft binaries are not found
1359                         CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1360                         g_isAmdFftAvailable = true;
1361                     }
1362                     catch (const Exception &)
1363                     {
1364                         g_isAmdFftAvailable = false;
1365                     }
1366                 }
1367                 else
1368                     g_isAmdFftAvailable = false;
1369
1370                 g_isAmdFftInitialized = true;
1371             }
1372         }
1373     }
1374
1375 private:
1376     static clAmdFftSetupData setupData;
1377     static bool g_isAmdFftInitialized;
1378     static bool g_isAmdFftAvailable;
1379 };
1380
1381 clAmdFftSetupData AmdFftHelper::setupData;
1382 bool AmdFftHelper::g_isAmdFftAvailable = false;
1383 bool AmdFftHelper::g_isAmdFftInitialized = false;
1384
1385 bool haveAmdFft()
1386 {
1387     return AmdFftHelper::getInstance().isAvailable();
1388 }
1389
1390 #else
1391
1392 bool haveAmdFft()
1393 {
1394     return false;
1395 }
1396
1397 #endif
1398
1399 bool haveSVM()
1400 {
1401 #ifdef HAVE_OPENCL_SVM
1402     return true;
1403 #else
1404     return false;
1405 #endif
1406 }
1407
1408 void finish()
1409 {
1410     Queue::getDefault().finish();
1411 }
1412
1413 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1414
1415 struct Platform::Impl
1416 {
1417     Impl()
1418     {
1419         refcount = 1;
1420         handle = 0;
1421         initialized = false;
1422     }
1423
1424     ~Impl() {}
1425
1426     void init()
1427     {
1428         if( !initialized )
1429         {
1430             //cl_uint num_entries
1431             cl_uint n = 0;
1432             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1433                 handle = 0;
1434             if( handle != 0 )
1435             {
1436                 char buf[1000];
1437                 size_t len = 0;
1438                 CV_OCL_DBG_CHECK(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len));
1439                 buf[len] = '\0';
1440                 vendor = String(buf);
1441             }
1442
1443             initialized = true;
1444         }
1445     }
1446
1447     IMPLEMENT_REFCOUNTABLE();
1448
1449     cl_platform_id handle;
1450     String vendor;
1451     bool initialized;
1452 };
1453
1454 Platform::Platform()
1455 {
1456     p = 0;
1457 }
1458
1459 Platform::~Platform()
1460 {
1461     if(p)
1462         p->release();
1463 }
1464
1465 Platform::Platform(const Platform& pl)
1466 {
1467     p = (Impl*)pl.p;
1468     if(p)
1469         p->addref();
1470 }
1471
1472 Platform& Platform::operator = (const Platform& pl)
1473 {
1474     Impl* newp = (Impl*)pl.p;
1475     if(newp)
1476         newp->addref();
1477     if(p)
1478         p->release();
1479     p = newp;
1480     return *this;
1481 }
1482
1483 void* Platform::ptr() const
1484 {
1485     return p ? p->handle : 0;
1486 }
1487
1488 Platform& Platform::getDefault()
1489 {
1490     CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
1491     static Platform p;
1492     if( !p.p )
1493     {
1494         p.p = new Impl;
1495         p.p->init();
1496     }
1497     return p;
1498 }
1499
1500 /////////////////////////////////////// Device ////////////////////////////////////////////
1501
1502 // Version has format:
1503 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1504 // by specification
1505 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1506 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1507 //   https://www.khronos.org/registry/OpenCL/sdk/1.1/docs/man/xhtml/clGetPlatformInfo.html
1508 //   https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clGetPlatformInfo.html
1509 static void parseOpenCLVersion(const String &version, int &major, int &minor)
1510 {
1511     major = minor = 0;
1512     if (10 >= version.length())
1513         return;
1514     const char *pstr = version.c_str();
1515     if (0 != strncmp(pstr, "OpenCL ", 7))
1516         return;
1517     size_t ppos = version.find('.', 7);
1518     if (String::npos == ppos)
1519         return;
1520     String temp = version.substr(7, ppos - 7);
1521     major = atoi(temp.c_str());
1522     temp = version.substr(ppos + 1);
1523     minor = atoi(temp.c_str());
1524 }
1525
1526 struct Device::Impl
1527 {
1528     Impl(void* d)
1529         : refcount(1)
1530         , handle(0)
1531     {
1532         try
1533         {
1534             cl_device_id device = (cl_device_id)d;
1535             _init(device);
1536             CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
1537         }
1538         catch (...)
1539         {
1540             throw;
1541         }
1542     }
1543
1544     void _init(cl_device_id d)
1545     {
1546         handle = (cl_device_id)d;
1547
1548         name_ = getStrProp(CL_DEVICE_NAME);
1549         version_ = getStrProp(CL_DEVICE_VERSION);
1550         extensions_ = getStrProp(CL_DEVICE_EXTENSIONS);
1551         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1552         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1553         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1554         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1555         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1556         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1557         addressBits_ = getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS);
1558
1559         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1560         parseOpenCLVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1561
1562         size_t pos = 0;
1563         while (pos < extensions_.size())
1564         {
1565             size_t pos2 = extensions_.find(' ', pos);
1566             if (pos2 == String::npos)
1567                 pos2 = extensions_.size();
1568             if (pos2 > pos)
1569             {
1570                 std::string extensionName = extensions_.substr(pos, pos2 - pos);
1571                 extensions_set_.insert(extensionName);
1572             }
1573             pos = pos2 + 1;
1574         }
1575
1576         intelSubgroupsSupport_ = isExtensionSupported("cl_intel_subgroups");
1577
1578         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1579         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1580             vendorName_ == "AMD")
1581             vendorID_ = VENDOR_AMD;
1582         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1583             vendorID_ = VENDOR_INTEL;
1584         else if (vendorName_ == "NVIDIA Corporation")
1585             vendorID_ = VENDOR_NVIDIA;
1586         else
1587             vendorID_ = UNKNOWN_VENDOR;
1588
1589         const size_t CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE", 0);
1590         if (CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE > 0)
1591         {
1592             const size_t new_maxWorkGroupSize = std::min(maxWorkGroupSize_, CV_OPENCL_DEVICE_MAX_WORK_GROUP_SIZE);
1593             if (new_maxWorkGroupSize != maxWorkGroupSize_)
1594                 CV_LOG_WARNING(NULL, "OpenCL: using workgroup size: " << new_maxWorkGroupSize << " (was " << maxWorkGroupSize_ << ")");
1595             maxWorkGroupSize_ = new_maxWorkGroupSize;
1596         }
1597 #if 0
1598         if (isExtensionSupported("cl_khr_spir"))
1599         {
1600 #ifndef CL_DEVICE_SPIR_VERSIONS
1601 #define CL_DEVICE_SPIR_VERSIONS                     0x40E0
1602 #endif
1603             cv::String spir_versions = getStrProp(CL_DEVICE_SPIR_VERSIONS);
1604             std::cout << spir_versions << std::endl;
1605         }
1606 #endif
1607     }
1608
1609     ~Impl()
1610     {
1611 #ifdef _WIN32
1612         if (!cv::__termination)
1613 #endif
1614         {
1615             if (handle)
1616             {
1617                 CV_OCL_CHECK(clReleaseDevice(handle));
1618                 handle = 0;
1619             }
1620         }
1621     }
1622
1623     template<typename _TpCL, typename _TpOut>
1624     _TpOut getProp(cl_device_info prop) const
1625     {
1626         _TpCL temp=_TpCL();
1627         size_t sz = 0;
1628
1629         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1630             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1631     }
1632
1633     bool getBoolProp(cl_device_info prop) const
1634     {
1635         cl_bool temp = CL_FALSE;
1636         size_t sz = 0;
1637
1638         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1639             sz == sizeof(temp) ? temp != 0 : false;
1640     }
1641
1642     String getStrProp(cl_device_info prop) const
1643     {
1644         char buf[4096];
1645         size_t sz=0;
1646         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1647             sz < sizeof(buf) ? String(buf) : String();
1648     }
1649
1650     bool isExtensionSupported(const std::string& extensionName) const
1651     {
1652         return extensions_set_.count(extensionName) > 0;
1653     }
1654
1655
1656     IMPLEMENT_REFCOUNTABLE();
1657
1658     cl_device_id handle;
1659
1660     String name_;
1661     String version_;
1662     std::string extensions_;
1663     int doubleFPConfig_;
1664     bool hostUnifiedMemory_;
1665     int maxComputeUnits_;
1666     size_t maxWorkGroupSize_;
1667     int type_;
1668     int addressBits_;
1669     int deviceVersionMajor_;
1670     int deviceVersionMinor_;
1671     String driverVersion_;
1672     String vendorName_;
1673     int vendorID_;
1674     bool intelSubgroupsSupport_;
1675
1676     std::set<std::string> extensions_set_;
1677 };
1678
1679
1680 Device::Device()
1681 {
1682     p = 0;
1683 }
1684
1685 Device::Device(void* d)
1686 {
1687     p = 0;
1688     set(d);
1689 }
1690
1691 Device::Device(const Device& d)
1692 {
1693     p = d.p;
1694     if(p)
1695         p->addref();
1696 }
1697
1698 Device& Device::operator = (const Device& d)
1699 {
1700     Impl* newp = (Impl*)d.p;
1701     if(newp)
1702         newp->addref();
1703     if(p)
1704         p->release();
1705     p = newp;
1706     return *this;
1707 }
1708
1709 Device::~Device()
1710 {
1711     if(p)
1712         p->release();
1713 }
1714
1715 void Device::set(void* d)
1716 {
1717     if(p)
1718         p->release();
1719     p = new Impl(d);
1720     if (p->handle)
1721     {
1722         CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
1723     }
1724 }
1725
1726 Device Device::fromHandle(void* d)
1727 {
1728     Device device(d);
1729     return device;
1730 }
1731
1732 void* Device::ptr() const
1733 {
1734     return p ? p->handle : 0;
1735 }
1736
1737 String Device::name() const
1738 { return p ? p->name_ : String(); }
1739
1740 String Device::extensions() const
1741 { return p ? String(p->extensions_) : String(); }
1742
1743 bool Device::isExtensionSupported(const String& extensionName) const
1744 { return p ? p->isExtensionSupported(extensionName) : false; }
1745
1746 String Device::version() const
1747 { return p ? p->version_ : String(); }
1748
1749 String Device::vendorName() const
1750 { return p ? p->vendorName_ : String(); }
1751
1752 int Device::vendorID() const
1753 { return p ? p->vendorID_ : 0; }
1754
1755 String Device::OpenCL_C_Version() const
1756 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1757
1758 String Device::OpenCLVersion() const
1759 { return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
1760
1761 int Device::deviceVersionMajor() const
1762 { return p ? p->deviceVersionMajor_ : 0; }
1763
1764 int Device::deviceVersionMinor() const
1765 { return p ? p->deviceVersionMinor_ : 0; }
1766
1767 String Device::driverVersion() const
1768 { return p ? p->driverVersion_ : String(); }
1769
1770 int Device::type() const
1771 { return p ? p->type_ : 0; }
1772
1773 int Device::addressBits() const
1774 { return p ? p->addressBits_ : 0; }
1775
1776 bool Device::available() const
1777 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1778
1779 bool Device::compilerAvailable() const
1780 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1781
1782 bool Device::linkerAvailable() const
1783 #ifdef CL_VERSION_1_2
1784 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1785 #else
1786 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1787 #endif
1788
1789 int Device::doubleFPConfig() const
1790 { return p ? p->doubleFPConfig_ : 0; }
1791
1792 int Device::singleFPConfig() const
1793 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1794
1795 int Device::halfFPConfig() const
1796 #ifdef CL_VERSION_1_2
1797 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1798 #else
1799 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1800 #endif
1801
1802 bool Device::endianLittle() const
1803 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1804
1805 bool Device::errorCorrectionSupport() const
1806 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1807
1808 int Device::executionCapabilities() const
1809 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1810
1811 size_t Device::globalMemCacheSize() const
1812 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1813
1814 int Device::globalMemCacheType() const
1815 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1816
1817 int Device::globalMemCacheLineSize() const
1818 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1819
1820 size_t Device::globalMemSize() const
1821 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1822
1823 size_t Device::localMemSize() const
1824 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1825
1826 int Device::localMemType() const
1827 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1828
1829 bool Device::hostUnifiedMemory() const
1830 { return p ? p->hostUnifiedMemory_ : false; }
1831
1832 bool Device::imageSupport() const
1833 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1834
1835 bool Device::imageFromBufferSupport() const
1836 {
1837     return p ? p->isExtensionSupported("cl_khr_image2d_from_buffer") : false;
1838 }
1839
1840 uint Device::imagePitchAlignment() const
1841 {
1842 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1843     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1844 #else
1845     return 0;
1846 #endif
1847 }
1848
1849 uint Device::imageBaseAddressAlignment() const
1850 {
1851 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1852     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1853 #else
1854     return 0;
1855 #endif
1856 }
1857
1858 size_t Device::image2DMaxWidth() const
1859 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1860
1861 size_t Device::image2DMaxHeight() const
1862 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1863
1864 size_t Device::image3DMaxWidth() const
1865 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1866
1867 size_t Device::image3DMaxHeight() const
1868 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1869
1870 size_t Device::image3DMaxDepth() const
1871 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1872
1873 size_t Device::imageMaxBufferSize() const
1874 #ifdef CL_VERSION_1_2
1875 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1876 #else
1877 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1878 #endif
1879
1880 size_t Device::imageMaxArraySize() const
1881 #ifdef CL_VERSION_1_2
1882 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1883 #else
1884 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1885 #endif
1886
1887 bool Device::intelSubgroupsSupport() const
1888 { return p ? p->intelSubgroupsSupport_ : false; }
1889
1890 int Device::maxClockFrequency() const
1891 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1892
1893 int Device::maxComputeUnits() const
1894 { return p ? p->maxComputeUnits_ : 0; }
1895
1896 int Device::maxConstantArgs() const
1897 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1898
1899 size_t Device::maxConstantBufferSize() const
1900 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1901
1902 size_t Device::maxMemAllocSize() const
1903 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1904
1905 size_t Device::maxParameterSize() const
1906 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1907
1908 int Device::maxReadImageArgs() const
1909 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
1910
1911 int Device::maxWriteImageArgs() const
1912 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
1913
1914 int Device::maxSamplers() const
1915 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
1916
1917 size_t Device::maxWorkGroupSize() const
1918 { return p ? p->maxWorkGroupSize_ : 0; }
1919
1920 int Device::maxWorkItemDims() const
1921 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
1922
1923 void Device::maxWorkItemSizes(size_t* sizes) const
1924 {
1925     if(p)
1926     {
1927         const int MAX_DIMS = 32;
1928         size_t retsz = 0;
1929         CV_OCL_DBG_CHECK(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
1930                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz));
1931     }
1932 }
1933
1934 int Device::memBaseAddrAlign() const
1935 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
1936
1937 int Device::nativeVectorWidthChar() const
1938 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
1939
1940 int Device::nativeVectorWidthShort() const
1941 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
1942
1943 int Device::nativeVectorWidthInt() const
1944 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
1945
1946 int Device::nativeVectorWidthLong() const
1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
1948
1949 int Device::nativeVectorWidthFloat() const
1950 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
1951
1952 int Device::nativeVectorWidthDouble() const
1953 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
1954
1955 int Device::nativeVectorWidthHalf() const
1956 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
1957
1958 int Device::preferredVectorWidthChar() const
1959 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
1960
1961 int Device::preferredVectorWidthShort() const
1962 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
1963
1964 int Device::preferredVectorWidthInt() const
1965 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
1966
1967 int Device::preferredVectorWidthLong() const
1968 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
1969
1970 int Device::preferredVectorWidthFloat() const
1971 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
1972
1973 int Device::preferredVectorWidthDouble() const
1974 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
1975
1976 int Device::preferredVectorWidthHalf() const
1977 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
1978
1979 size_t Device::printfBufferSize() const
1980 #ifdef CL_VERSION_1_2
1981 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
1982 #else
1983 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1984 #endif
1985
1986
1987 size_t Device::profilingTimerResolution() const
1988 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
1989
1990 const Device& Device::getDefault()
1991 {
1992     auto& c = OpenCLExecutionContext::getCurrent();
1993     if (!c.empty())
1994     {
1995         return c.getDevice();
1996     }
1997
1998     static Device dummy;
1999     return dummy;
2000 }
2001
2002 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2003
2004 template <typename Functor, typename ObjectType>
2005 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2006 {
2007     ::size_t required;
2008     cl_int err = f(obj, name, 0, NULL, &required);
2009     if (err != CL_SUCCESS)
2010         return err;
2011
2012     param.clear();
2013     if (required > 0)
2014     {
2015         AutoBuffer<char> buf(required + 1);
2016         char* ptr = buf.data(); // cleanup is not needed
2017         err = f(obj, name, required, ptr, NULL);
2018         if (err != CL_SUCCESS)
2019             return err;
2020         param = ptr;
2021     }
2022
2023     return CL_SUCCESS;
2024 }
2025
2026 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2027 {
2028     elems.clear();
2029     if (s.size() == 0)
2030         return;
2031     std::istringstream ss(s);
2032     std::string item;
2033     while (!ss.eof())
2034     {
2035         std::getline(ss, item, delim);
2036         elems.push_back(item);
2037     }
2038 }
2039
2040 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2041 // Sample: AMD:GPU:
2042 // Sample: AMD:GPU:Tahiti
2043 // Sample: :GPU|CPU: = '' = ':' = '::'
2044 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2045         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2046 {
2047     std::vector<std::string> parts;
2048     split(configurationStr, ':', parts);
2049     if (parts.size() > 3)
2050     {
2051         CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
2052         return false;
2053     }
2054     if (parts.size() > 2)
2055         deviceNameOrID = parts[2];
2056     if (parts.size() > 1)
2057     {
2058         split(parts[1], '|', deviceTypes);
2059     }
2060     if (parts.size() > 0)
2061     {
2062         platform = parts[0];
2063     }
2064     return true;
2065 }
2066
2067 #if defined WINRT || defined _WIN32_WCE
2068 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2069 {
2070     CV_UNUSED(configuration)
2071     return NULL;
2072 }
2073 #else
2074 static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
2075 {
2076     std::string platform, deviceName;
2077     std::vector<std::string> deviceTypes;
2078
2079     if (!configuration)
2080         configuration = getenv("OPENCV_OPENCL_DEVICE");
2081
2082     if (configuration &&
2083             (strcmp(configuration, "disabled") == 0 ||
2084              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2085             ))
2086         return NULL;
2087
2088     bool isID = false;
2089     int deviceID = -1;
2090     if (deviceName.length() == 1)
2091     // We limit ID range to 0..9, because we want to write:
2092     // - '2500' to mean i5-2500
2093     // - '8350' to mean AMD FX-8350
2094     // - '650' to mean GeForce 650
2095     // To extend ID range change condition to '> 0'
2096     {
2097         isID = true;
2098         for (size_t i = 0; i < deviceName.length(); i++)
2099         {
2100             if (!isdigit(deviceName[i]))
2101             {
2102                 isID = false;
2103                 break;
2104             }
2105         }
2106         if (isID)
2107         {
2108             deviceID = atoi(deviceName.c_str());
2109             if (deviceID < 0)
2110                 return NULL;
2111         }
2112     }
2113
2114     std::vector<cl_platform_id> platforms;
2115     {
2116         cl_uint numPlatforms = 0;
2117         CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
2118
2119         if (numPlatforms == 0)
2120             return NULL;
2121         platforms.resize((size_t)numPlatforms);
2122         CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
2123         platforms.resize(numPlatforms);
2124     }
2125
2126     int selectedPlatform = -1;
2127     if (platform.length() > 0)
2128     {
2129         for (size_t i = 0; i < platforms.size(); i++)
2130         {
2131             std::string name;
2132             CV_OCL_DBG_CHECK(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name));
2133             if (name.find(platform) != std::string::npos)
2134             {
2135                 selectedPlatform = (int)i;
2136                 break;
2137             }
2138         }
2139         if (selectedPlatform == -1)
2140         {
2141             CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
2142             goto not_found;
2143         }
2144     }
2145     if (deviceTypes.size() == 0)
2146     {
2147         if (!isID)
2148         {
2149             deviceTypes.push_back("GPU");
2150             if (configuration)
2151                 deviceTypes.push_back("CPU");
2152         }
2153         else
2154             deviceTypes.push_back("ALL");
2155     }
2156     for (size_t t = 0; t < deviceTypes.size(); t++)
2157     {
2158         int deviceType = 0;
2159         std::string tempStrDeviceType = deviceTypes[t];
2160         std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
2161
2162         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2163             deviceType = Device::TYPE_GPU;
2164         else if (tempStrDeviceType == "cpu")
2165             deviceType = Device::TYPE_CPU;
2166         else if (tempStrDeviceType == "accelerator")
2167             deviceType = Device::TYPE_ACCELERATOR;
2168         else if (tempStrDeviceType == "all")
2169             deviceType = Device::TYPE_ALL;
2170         else
2171         {
2172             CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
2173             goto not_found;
2174         }
2175
2176         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2177         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2178                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2179                 i++)
2180         {
2181             cl_uint count = 0;
2182             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2183             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2184             {
2185                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get count");
2186             }
2187             if (count == 0)
2188                 continue;
2189             size_t base = devices.size();
2190             devices.resize(base + count);
2191             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2192             if (!(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND))
2193             {
2194                 CV_OCL_DBG_CHECK_RESULT(status, "clGetDeviceIDs get IDs");
2195             }
2196         }
2197
2198         for (size_t i = (isID ? deviceID : 0);
2199              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2200              i++)
2201         {
2202             std::string name;
2203             CV_OCL_DBG_CHECK(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name));
2204             cl_bool useGPU = true;
2205             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2206             {
2207                 cl_bool isIGPU = CL_FALSE;
2208                 CV_OCL_DBG_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL));
2209                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2210             }
2211             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2212             {
2213                 // TODO check for OpenCL 1.1
2214                 return devices[i];
2215             }
2216         }
2217     }
2218
2219 not_found:
2220     if (!configuration)
2221         return NULL; // suppress messages on stderr
2222
2223     std::ostringstream msg;
2224     msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
2225         << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2226         << "    Device types:";
2227     for (size_t t = 0; t < deviceTypes.size(); t++)
2228         msg << ' ' << deviceTypes[t];
2229
2230     msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
2231
2232     CV_LOG_ERROR(NULL, msg.str());
2233     return NULL;
2234 }
2235 #endif
2236
2237 #ifdef HAVE_OPENCL_SVM
2238 namespace svm {
2239
2240 enum AllocatorFlags { // don't use first 16 bits
2241         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
2242         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
2243         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
2244         OPENCL_SVM_BUFFER_MASK = 3 << 16,
2245         OPENCL_SVM_BUFFER_MAP = 4 << 16
2246 };
2247
2248 static bool checkForceSVMUmatUsage()
2249 {
2250     static bool initialized = false;
2251     static bool force = false;
2252     if (!initialized)
2253     {
2254         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
2255         initialized = true;
2256     }
2257     return force;
2258 }
2259 static bool checkDisableSVMUMatUsage()
2260 {
2261     static bool initialized = false;
2262     static bool force = false;
2263     if (!initialized)
2264     {
2265         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
2266         initialized = true;
2267     }
2268     return force;
2269 }
2270 static bool checkDisableSVM()
2271 {
2272     static bool initialized = false;
2273     static bool force = false;
2274     if (!initialized)
2275     {
2276         force = utils::getConfigurationParameterBool("OPENCV_OPENCL_SVM_DISABLE", false);
2277         initialized = true;
2278     }
2279     return force;
2280 }
2281 // see SVMCapabilities
2282 static unsigned int getSVMCapabilitiesMask()
2283 {
2284     static bool initialized = false;
2285     static unsigned int mask = 0;
2286     if (!initialized)
2287     {
2288         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
2289         if (envValue == NULL)
2290         {
2291             return ~0U; // all bits 1
2292         }
2293         mask = atoi(envValue);
2294         initialized = true;
2295     }
2296     return mask;
2297 }
2298 } // namespace
2299 #endif
2300
2301 static size_t getProgramCountLimit()
2302 {
2303     static bool initialized = false;
2304     static size_t count = 0;
2305     if (!initialized)
2306     {
2307         count = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_PROGRAM_CACHE", 0);
2308         initialized = true;
2309     }
2310     return count;
2311 }
2312
2313 static int g_contextId = 0;
2314
2315 class OpenCLBufferPoolImpl;
2316 class OpenCLSVMBufferPoolImpl;
2317
2318 struct Context::Impl
2319 {
2320     static Context::Impl* get(Context& context) { return context.p; }
2321
2322     typedef std::deque<Context::Impl*> container_t;
2323     static container_t& getGlobalContainer()
2324     {
2325         // never delete this container (Impl lifetime is greater due to TLS storage)
2326         static container_t* g_contexts = new container_t();
2327         return *g_contexts;
2328     }
2329
2330 protected:
2331     Impl(const std::string& configuration_)
2332         : refcount(1)
2333         , contextId(CV_XADD(&g_contextId, 1))
2334         , configuration(configuration_)
2335         , handle(0)
2336 #ifdef HAVE_DIRECTX
2337         , p_directx_impl(0)
2338 #endif
2339 #ifdef HAVE_OPENCL_SVM
2340         , svmInitialized(false)
2341 #endif
2342     {
2343         if (!haveOpenCL())
2344             CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
2345
2346         cv::AutoLock lock(cv::getInitializationMutex());
2347         auto& container = getGlobalContainer();
2348         container.resize(std::max(container.size(), (size_t)contextId + 1));
2349         container[contextId] = this;
2350     }
2351
2352     ~Impl()
2353     {
2354 #ifdef _WIN32
2355         if (!cv::__termination)
2356 #endif
2357         {
2358             if (handle)
2359             {
2360                 CV_OCL_DBG_CHECK(clReleaseContext(handle));
2361                 handle = NULL;
2362             }
2363             devices.clear();
2364 #ifdef HAVE_DIRECTX
2365             directx::internal::deleteDirectXImpl(&p_directx_impl);
2366 #endif
2367         }
2368
2369         {
2370             cv::AutoLock lock(cv::getInitializationMutex());
2371             auto& container = getGlobalContainer();
2372             CV_CheckLT((size_t)contextId, container.size(), "");
2373             container[contextId] = NULL;
2374         }
2375     }
2376
2377     void init_device_list()
2378     {
2379         CV_Assert(handle);
2380
2381         cl_uint ndevices = 0;
2382         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
2383         CV_Assert(ndevices > 0);
2384
2385         cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
2386         size_t devices_ret_size = 0;
2387         CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
2388         CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
2389
2390         devices.clear();
2391         for (unsigned i = 0; i < ndevices; i++)
2392         {
2393             devices.emplace_back(Device::fromHandle(cl_devices[i]));
2394         }
2395     }
2396
2397     void __init_buffer_pools();  // w/o synchronization
2398     void _init_buffer_pools() const
2399     {
2400         if (!bufferPool_)
2401         {
2402             cv::AutoLock lock(cv::getInitializationMutex());
2403             if (!bufferPool_)
2404             {
2405                 const_cast<Impl*>(this)->__init_buffer_pools();
2406             }
2407         }
2408     }
2409 public:
2410     static Impl* findContext(const std::string& configuration)
2411     {
2412         CV_TRACE_FUNCTION();
2413         cv::AutoLock lock(cv::getInitializationMutex());
2414         auto& container = getGlobalContainer();
2415         if (configuration.empty() && !container.empty())
2416             return container[0];
2417         for (auto it = container.begin(); it != container.end(); ++it)
2418         {
2419             Impl* i = *it;
2420             if (i && i->configuration == configuration)
2421             {
2422                 return i;
2423             }
2424         }
2425         return NULL;
2426     }
2427
2428     static Impl* findOrCreateContext(const std::string& configuration_)
2429     {
2430         CV_TRACE_FUNCTION();
2431         std::string configuration = configuration_;
2432         if (configuration_.empty())
2433         {
2434             const char* c = getenv("OPENCV_OPENCL_DEVICE");
2435             if (c)
2436                 configuration = c;
2437         }
2438         Impl* impl = findContext(configuration);
2439         if (impl)
2440         {
2441             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2442             impl->addref();
2443             return impl;
2444         }
2445
2446         cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
2447         if (d == NULL)
2448             return NULL;
2449
2450         impl = new Impl(configuration);
2451         try
2452         {
2453             impl->createFromDevice(d);
2454             if (impl->handle)
2455                 return impl;
2456             delete impl;
2457             return NULL;
2458         }
2459         catch (...)
2460         {
2461             delete impl;
2462             throw;
2463         }
2464     }
2465
2466     static Impl* findOrCreateContext(cl_context h)
2467     {
2468         CV_TRACE_FUNCTION();
2469
2470         CV_Assert(h);
2471
2472         std::string configuration = cv::format("@ctx-%p", (void*)h);
2473         Impl* impl = findContext(configuration);
2474         if (impl)
2475         {
2476             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2477             impl->addref();
2478             return impl;
2479         }
2480
2481         impl = new Impl(configuration);
2482         try
2483         {
2484             CV_OCL_CHECK(clRetainContext(h));
2485             impl->handle = h;
2486             impl->init_device_list();
2487             return impl;
2488         }
2489         catch (...)
2490         {
2491             delete impl;
2492             throw;
2493         }
2494     }
2495
2496     static Impl* findOrCreateContext(const ocl::Device& device)
2497     {
2498         CV_TRACE_FUNCTION();
2499
2500         CV_Assert(!device.empty());
2501         cl_device_id d = (cl_device_id)device.ptr();
2502         CV_Assert(d);
2503
2504         std::string configuration = cv::format("@dev-%p", (void*)d);
2505         Impl* impl = findContext(configuration);
2506         if (impl)
2507         {
2508             CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
2509             impl->addref();
2510             return impl;
2511         }
2512
2513         impl = new Impl(configuration);
2514         try
2515         {
2516             impl->createFromDevice(d);
2517             CV_Assert(impl->handle);
2518             return impl;
2519         }
2520         catch (...)
2521         {
2522             delete impl;
2523             throw;
2524         }
2525     }
2526
2527     void setDefault()
2528     {
2529         CV_TRACE_FUNCTION();
2530         cl_device_id d = selectOpenCLDevice();
2531
2532         if (d == NULL)
2533             return;
2534
2535         createFromDevice(d);
2536     }
2537
2538     void createFromDevice(cl_device_id d)
2539     {
2540         CV_TRACE_FUNCTION();
2541         CV_Assert(handle == NULL);
2542
2543         cl_platform_id pl = NULL;
2544         CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
2545
2546         cl_context_properties prop[] =
2547         {
2548             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2549             0
2550         };
2551
2552         // !!! in the current implementation force the number of devices to 1 !!!
2553         cl_uint nd = 1;
2554         cl_int status;
2555
2556         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2557         CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
2558
2559         bool ok = handle != 0 && status == CL_SUCCESS;
2560         if( ok )
2561         {
2562             devices.resize(nd);
2563             devices[0].set(d);
2564         }
2565         else
2566             handle = NULL;
2567     }
2568
2569     Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
2570
2571     void unloadProg(Program& prog)
2572     {
2573         cv::AutoLock lock(program_cache_mutex);
2574         for (CacheList::iterator i = cacheList.begin(); i != cacheList.end(); ++i)
2575         {
2576               phash_t::iterator it = phash.find(*i);
2577               if (it != phash.end())
2578               {
2579                   if (it->second.ptr() == prog.ptr())
2580                   {
2581                       phash.erase(*i);
2582                       cacheList.erase(i);
2583                       return;
2584                   }
2585               }
2586         }
2587     }
2588
2589     std::string& getPrefixString()
2590     {
2591         if (prefix.empty())
2592         {
2593             cv::AutoLock lock(program_cache_mutex);
2594             if (prefix.empty())
2595             {
2596                 CV_Assert(!devices.empty());
2597                 const Device& d = devices[0];
2598                 int bits = d.addressBits();
2599                 if (bits > 0 && bits != 64)
2600                     prefix = cv::format("%d-bit--", bits);
2601                 prefix += d.vendorName() + "--" + d.name() + "--" + d.driverVersion();
2602                 // sanitize chars
2603                 for (size_t i = 0; i < prefix.size(); i++)
2604                 {
2605                     char c = prefix[i];
2606                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2607                     {
2608                         prefix[i] = '_';
2609                     }
2610                 }
2611             }
2612         }
2613         return prefix;
2614     }
2615
2616     std::string& getPrefixBase()
2617     {
2618         if (prefix_base.empty())
2619         {
2620             cv::AutoLock lock(program_cache_mutex);
2621             if (prefix_base.empty())
2622             {
2623                 const Device& d = devices[0];
2624                 int bits = d.addressBits();
2625                 if (bits > 0 && bits != 64)
2626                     prefix_base = cv::format("%d-bit--", bits);
2627                 prefix_base += d.vendorName() + "--" + d.name() + "--";
2628                 // sanitize chars
2629                 for (size_t i = 0; i < prefix_base.size(); i++)
2630                 {
2631                     char c = prefix_base[i];
2632                     if (!((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c == '-'))
2633                     {
2634                         prefix_base[i] = '_';
2635                     }
2636                 }
2637             }
2638         }
2639         return prefix_base;
2640     }
2641
2642     IMPLEMENT_REFCOUNTABLE();
2643
2644     const int contextId;  // global unique ID
2645     const std::string configuration;
2646
2647     cl_context handle;
2648     std::vector<Device> devices;
2649
2650     std::string prefix;
2651     std::string prefix_base;
2652
2653     cv::Mutex program_cache_mutex;
2654     typedef std::map<std::string, Program> phash_t;
2655     phash_t phash;
2656     typedef std::list<cv::String> CacheList;
2657     CacheList cacheList;
2658
2659     std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
2660     std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
2661     OpenCLBufferPoolImpl& getBufferPool() const
2662     {
2663         _init_buffer_pools();
2664         CV_DbgAssert(bufferPool_);
2665         return *bufferPool_.get();
2666     }
2667     OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
2668     {
2669         _init_buffer_pools();
2670         CV_DbgAssert(bufferPoolHostPtr_);
2671         return *bufferPoolHostPtr_.get();
2672     }
2673
2674 #ifdef HAVE_DIRECTX
2675     directx::internal::OpenCLDirectXImpl* p_directx_impl;
2676
2677     directx::internal::OpenCLDirectXImpl* getDirectXImpl()
2678     {
2679         if (!p_directx_impl)
2680         {
2681             p_directx_impl = directx::internal::createDirectXImpl();
2682         }
2683         return p_directx_impl;
2684     }
2685 #endif
2686
2687 #ifdef HAVE_OPENCL_SVM
2688     bool svmInitialized;
2689     bool svmAvailable;
2690     bool svmEnabled;
2691     svm::SVMCapabilities svmCapabilities;
2692     svm::SVMFunctions svmFunctions;
2693
2694     void svmInit()
2695     {
2696         CV_Assert(handle != NULL);
2697         const Device& device = devices[0];
2698         cl_device_svm_capabilities deviceCaps = 0;
2699         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
2700         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
2701         if (status != CL_SUCCESS)
2702         {
2703             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
2704             goto noSVM;
2705         }
2706         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
2707         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
2708         svmCapabilities.value_ =
2709                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
2710                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
2711                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
2712                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
2713         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
2714         if (svmCapabilities.value_ == 0)
2715         {
2716             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
2717             goto noSVM;
2718         }
2719         try
2720         {
2721             // Try OpenCL 2.0
2722             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
2723             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
2724             if (!ptr)
2725             {
2726                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
2727                 CV_Error(Error::StsBadArg, "clSVMAlloc returned NULL");
2728             }
2729             try
2730             {
2731                 bool error = false;
2732                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
2733                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
2734                 {
2735                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
2736                     CV_Error(Error::StsBadArg, "clEnqueueSVMMap FAILED");
2737                 }
2738                 clFinish(q);
2739                 try
2740                 {
2741                     ((int*)ptr)[0] = 100;
2742                 }
2743                 catch (...)
2744                 {
2745                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
2746                     error = true;
2747                 }
2748                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
2749                 {
2750                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
2751                     CV_Error(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
2752                 }
2753                 clFinish(q);
2754                 if (error)
2755                 {
2756                     CV_Error(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
2757                 }
2758             }
2759             catch (...)
2760             {
2761                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
2762                 clSVMFree(handle, ptr);
2763                 throw;
2764             }
2765             clSVMFree(handle, ptr);
2766             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
2767             svmFunctions.fn_clSVMFree = clSVMFree;
2768             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
2769             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
2770             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
2771             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
2772             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
2773             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
2774             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
2775         }
2776         catch (...)
2777         {
2778             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
2779             try
2780             {
2781                 // Try HSA extension
2782                 String extensions = device.extensions();
2783                 if (extensions.find("cl_amd_svm") == String::npos)
2784                 {
2785                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
2786                     goto noSVM;
2787                 }
2788                 cl_platform_id p = NULL;
2789                 CV_OCL_CHECK(status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL));
2790                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
2791                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
2792                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
2793                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
2794                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
2795                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
2796                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
2797                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
2798                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
2799                 CV_Assert(svmFunctions.isValid());
2800             }
2801             catch (...)
2802             {
2803                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
2804                 goto noSVM;
2805             }
2806         }
2807
2808         svmAvailable = true;
2809         svmEnabled = !svm::checkDisableSVM();
2810         svmInitialized = true;
2811         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
2812         return;
2813     noSVM:
2814         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
2815         svmAvailable = false;
2816         svmEnabled = false;
2817         svmCapabilities.value_ = 0;
2818         svmInitialized = true;
2819         svmFunctions.fn_clSVMAlloc = NULL;
2820         return;
2821     }
2822
2823     std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
2824
2825     OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
2826     {
2827         _init_buffer_pools();
2828         CV_DbgAssert(bufferPoolSVM_);
2829         return *bufferPoolSVM_.get();
2830     }
2831 #endif
2832
2833     friend class Program;
2834 };
2835
2836
2837 Context::Context()
2838 {
2839     p = 0;
2840 }
2841
2842 Context::~Context()
2843 {
2844     release();
2845 }
2846
2847 // deprecated
2848 Context::Context(int dtype)
2849 {
2850     p = 0;
2851     create(dtype);
2852 }
2853
2854 void Context::release()
2855 {
2856     if (p)
2857     {
2858         p->release();
2859         p = NULL;
2860     }
2861 }
2862
2863 bool Context::create()
2864 {
2865     release();
2866     if (!haveOpenCL())
2867         return false;
2868     p = Impl::findOrCreateContext(std::string());
2869     if (p && p->handle)
2870         return true;
2871     release();
2872     return false;
2873 }
2874
2875 // deprecated
2876 bool Context::create(int dtype)
2877 {
2878     if( !haveOpenCL() )
2879         return false;
2880     release();
2881     if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
2882     {
2883         p = Impl::findOrCreateContext("");
2884     }
2885     else if (dtype == CL_DEVICE_TYPE_GPU)
2886     {
2887         p = Impl::findOrCreateContext(":GPU:");
2888     }
2889     else if (dtype == CL_DEVICE_TYPE_CPU)
2890     {
2891         p = Impl::findOrCreateContext(":CPU:");
2892     }
2893     else
2894     {
2895         CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
2896     }
2897     if (p && !p->handle)
2898     {
2899         release();
2900     }
2901     return p != 0;
2902 }
2903
2904 Context::Context(const Context& c)
2905 {
2906     p = (Impl*)c.p;
2907     if(p)
2908         p->addref();
2909 }
2910
2911 Context& Context::operator = (const Context& c)
2912 {
2913     Impl* newp = (Impl*)c.p;
2914     if(newp)
2915         newp->addref();
2916     if(p)
2917         p->release();
2918     p = newp;
2919     return *this;
2920 }
2921
2922 void* Context::ptr() const
2923 {
2924     return p == NULL ? NULL : p->handle;
2925 }
2926
2927 size_t Context::ndevices() const
2928 {
2929     return p ? p->devices.size() : 0;
2930 }
2931
2932 Device& Context::device(size_t idx) const
2933 {
2934     static Device dummy;
2935     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2936 }
2937
2938 Context& Context::getDefault(bool initialize)
2939 {
2940     auto& c = OpenCLExecutionContext::getCurrent();
2941     if (!c.empty())
2942     {
2943         auto& ctx = c.getContext();
2944         return ctx;
2945     }
2946
2947     CV_UNUSED(initialize);
2948     static Context dummy;
2949     return dummy;
2950 }
2951
2952 Program Context::getProg(const ProgramSource& prog,
2953                          const String& buildopts, String& errmsg)
2954 {
2955     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2956 }
2957
2958 void Context::unloadProg(Program& prog)
2959 {
2960     if (p)
2961         p->unloadProg(prog);
2962 }
2963
2964 /* static */
2965 Context Context::fromHandle(void* context)
2966 {
2967     Context ctx;
2968     ctx.p = Impl::findOrCreateContext((cl_context)context);
2969     return ctx;
2970 }
2971
2972 /* static */
2973 Context Context::fromDevice(const ocl::Device& device)
2974 {
2975     Context ctx;
2976     ctx.p = Impl::findOrCreateContext(device);
2977     return ctx;
2978 }
2979
2980 /* static */
2981 Context Context::create(const std::string& configuration)
2982 {
2983     Context ctx;
2984     ctx.p = Impl::findOrCreateContext(configuration);
2985     return ctx;
2986 }
2987
2988 #ifdef HAVE_OPENCL_SVM
2989 bool Context::useSVM() const
2990 {
2991     Context::Impl* i = p;
2992     CV_Assert(i);
2993     if (!i->svmInitialized)
2994         i->svmInit();
2995     return i->svmEnabled;
2996 }
2997 void Context::setUseSVM(bool enabled)
2998 {
2999     Context::Impl* i = p;
3000     CV_Assert(i);
3001     if (!i->svmInitialized)
3002         i->svmInit();
3003     if (enabled && !i->svmAvailable)
3004     {
3005         CV_Error(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
3006     }
3007     i->svmEnabled = enabled;
3008 }
3009 #else
3010 bool Context::useSVM() const { return false; }
3011 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
3012 #endif
3013
3014 #ifdef HAVE_OPENCL_SVM
3015 namespace svm {
3016
3017 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
3018 {
3019     Context::Impl* i = context.p;
3020     CV_Assert(i);
3021     if (!i->svmInitialized)
3022         i->svmInit();
3023     return i->svmCapabilities;
3024 }
3025
3026 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
3027 {
3028     Context::Impl* i = context.p;
3029     CV_Assert(i);
3030     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
3031     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
3032     return &i->svmFunctions;
3033 }
3034
3035 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
3036 {
3037     if (checkForceSVMUmatUsage())
3038         return true;
3039     if (checkDisableSVMUMatUsage())
3040         return false;
3041     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
3042         return true;
3043     return false; // don't use SVM by default
3044 }
3045
3046 } // namespace cv::ocl::svm
3047 #endif // HAVE_OPENCL_SVM
3048
3049
3050 static void get_platform_name(cl_platform_id id, String& name)
3051 {
3052     // get platform name string length
3053     size_t sz = 0;
3054     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, 0, 0, &sz));
3055
3056     // get platform name string
3057     AutoBuffer<char> buf(sz + 1);
3058     CV_OCL_CHECK(clGetPlatformInfo(id, CL_PLATFORM_NAME, sz, buf.data(), 0));
3059
3060     // just in case, ensure trailing zero for ASCIIZ string
3061     buf[sz] = 0;
3062
3063     name = buf.data();
3064 }
3065
3066 /*
3067 // Attaches OpenCL context to OpenCV
3068 */
3069 void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
3070 {
3071     auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3072     ctx.bind();
3073 }
3074
3075 /* static */
3076 OpenCLExecutionContext OpenCLExecutionContext::create(
3077         const std::string& platformName, void* platformID, void* context, void* deviceID
3078 )
3079 {
3080     if (!haveOpenCL())
3081         CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
3082
3083     cl_uint cnt = 0;
3084     CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
3085
3086     if (cnt == 0)
3087         CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
3088
3089     std::vector<cl_platform_id> platforms(cnt);
3090
3091     CV_OCL_CHECK(clGetPlatformIDs(cnt, &platforms[0], 0));
3092
3093     bool platformAvailable = false;
3094
3095     // check if external platformName contained in list of available platforms in OpenCV
3096     for (unsigned int i = 0; i < cnt; i++)
3097     {
3098         String availablePlatformName;
3099         get_platform_name(platforms[i], availablePlatformName);
3100         // external platform is found in the list of available platforms
3101         if (platformName == availablePlatformName)
3102         {
3103             platformAvailable = true;
3104             break;
3105         }
3106     }
3107
3108     if (!platformAvailable)
3109         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3110
3111     // check if platformID corresponds to platformName
3112     String actualPlatformName;
3113     get_platform_name((cl_platform_id)platformID, actualPlatformName);
3114     if (platformName != actualPlatformName)
3115         CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
3116
3117     OpenCLExecutionContext ctx;
3118     ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
3119     CV_OCL_CHECK(clReleaseContext((cl_context)context));
3120     CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
3121     return ctx;
3122 }
3123
3124 void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
3125 {
3126     // internal call, less checks
3127     cl_platform_id platformID = (cl_platform_id)_platform;
3128     cl_context context = (cl_context)_context;
3129     cl_device_id deviceID = (cl_device_id)_device;
3130
3131     std::string platformName = PlatformInfo(&platformID).name();
3132
3133     auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
3134     CV_Assert(!clExecCtx.empty());
3135     ctx = clExecCtx.getContext();
3136 }
3137
3138 /////////////////////////////////////////// Queue /////////////////////////////////////////////
3139
3140 struct Queue::Impl
3141 {
3142     inline void __init()
3143     {
3144         refcount = 1;
3145         handle = 0;
3146         isProfilingQueue_ = false;
3147     }
3148
3149     Impl(cl_command_queue q)
3150     {
3151         __init();
3152         handle = q;
3153
3154         cl_command_queue_properties props = 0;
3155         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &props, NULL));
3156         isProfilingQueue_ = !!(props & CL_QUEUE_PROFILING_ENABLE);
3157     }
3158
3159     Impl(cl_command_queue q, bool isProfilingQueue)
3160     {
3161         __init();
3162         handle = q;
3163         isProfilingQueue_ = isProfilingQueue;
3164     }
3165
3166     Impl(const Context& c, const Device& d, bool withProfiling = false)
3167     {
3168         __init();
3169
3170         const Context* pc = &c;
3171         cl_context ch = (cl_context)pc->ptr();
3172         if( !ch )
3173         {
3174             pc = &Context::getDefault();
3175             ch = (cl_context)pc->ptr();
3176         }
3177         cl_device_id dh = (cl_device_id)d.ptr();
3178         if( !dh )
3179             dh = (cl_device_id)pc->device(0).ptr();
3180         cl_int retval = 0;
3181         cl_command_queue_properties props = withProfiling ? CL_QUEUE_PROFILING_ENABLE : 0;
3182         CV_OCL_DBG_CHECK_(handle = clCreateCommandQueue(ch, dh, props, &retval), retval);
3183         isProfilingQueue_ = withProfiling;
3184     }
3185
3186     ~Impl()
3187     {
3188 #ifdef _WIN32
3189         if (!cv::__termination)
3190 #endif
3191         {
3192             if(handle)
3193             {
3194                 CV_OCL_DBG_CHECK(clFinish(handle));
3195                 CV_OCL_DBG_CHECK(clReleaseCommandQueue(handle));
3196                 handle = NULL;
3197             }
3198         }
3199     }
3200
3201     const cv::ocl::Queue& getProfilingQueue(const cv::ocl::Queue& self)
3202     {
3203         if (isProfilingQueue_)
3204             return self;
3205
3206         if (profiling_queue_.ptr())
3207             return profiling_queue_;
3208
3209         cl_context ctx = 0;
3210         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL));
3211
3212         cl_device_id device = 0;
3213         CV_OCL_CHECK(clGetCommandQueueInfo(handle, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device, NULL));
3214
3215         cl_int result = CL_SUCCESS;
3216         cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE;
3217         cl_command_queue q = clCreateCommandQueue(ctx, device, props, &result);
3218         CV_OCL_DBG_CHECK_RESULT(result, "clCreateCommandQueue(with CL_QUEUE_PROFILING_ENABLE)");
3219
3220         Queue queue;
3221         queue.p = new Impl(q, true);
3222         profiling_queue_ = queue;
3223
3224         return profiling_queue_;
3225     }
3226
3227     IMPLEMENT_REFCOUNTABLE();
3228
3229     cl_command_queue handle;
3230     bool isProfilingQueue_;
3231     cv::ocl::Queue profiling_queue_;
3232 };
3233
3234 Queue::Queue()
3235 {
3236     p = 0;
3237 }
3238
3239 Queue::Queue(const Context& c, const Device& d)
3240 {
3241     p = 0;
3242     create(c, d);
3243 }
3244
3245 Queue::Queue(const Queue& q)
3246 {
3247     p = q.p;
3248     if(p)
3249         p->addref();
3250 }
3251
3252 Queue& Queue::operator = (const Queue& q)
3253 {
3254     Impl* newp = (Impl*)q.p;
3255     if(newp)
3256         newp->addref();
3257     if(p)
3258         p->release();
3259     p = newp;
3260     return *this;
3261 }
3262
3263 Queue::~Queue()
3264 {
3265     if(p)
3266         p->release();
3267 }
3268
3269 bool Queue::create(const Context& c, const Device& d)
3270 {
3271     if(p)
3272         p->release();
3273     p = new Impl(c, d);
3274     return p->handle != 0;
3275 }
3276
3277 void Queue::finish()
3278 {
3279     if(p && p->handle)
3280     {
3281         CV_OCL_DBG_CHECK(clFinish(p->handle));
3282     }
3283 }
3284
3285 const Queue& Queue::getProfilingQueue() const
3286 {
3287     CV_Assert(p);
3288     return p->getProfilingQueue(*this);
3289 }
3290
3291 void* Queue::ptr() const
3292 {
3293     return p ? p->handle : 0;
3294 }
3295
3296 Queue& Queue::getDefault()
3297 {
3298     auto& c = OpenCLExecutionContext::getCurrent();
3299     if (!c.empty())
3300     {
3301         auto& q = c.getQueue();
3302         return q;
3303     }
3304     static Queue dummy;
3305     return dummy;
3306 }
3307
3308 static cl_command_queue getQueue(const Queue& q)
3309 {
3310     cl_command_queue qq = (cl_command_queue)q.ptr();
3311     if(!qq)
3312         qq = (cl_command_queue)Queue::getDefault().ptr();
3313     return qq;
3314 }
3315
3316 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
3317
3318 KernelArg::KernelArg()
3319     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
3320 {
3321 }
3322
3323 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
3324     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
3325 {
3326     CV_Assert(_flags == LOCAL || _flags == CONSTANT || _m != NULL);
3327 }
3328
3329 KernelArg KernelArg::Constant(const Mat& m)
3330 {
3331     CV_Assert(m.isContinuous());
3332     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
3333 }
3334
3335 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
3336
3337 struct Kernel::Impl
3338 {
3339     Impl(const char* kname, const Program& prog) :
3340         refcount(1), handle(NULL), isInProgress(false), isAsyncRun(false), nu(0)
3341     {
3342         cl_program ph = (cl_program)prog.ptr();
3343         cl_int retval = 0;
3344         name = kname;
3345         if (ph)
3346         {
3347             handle = clCreateKernel(ph, kname, &retval);
3348             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateKernel('%s')", kname).c_str());
3349         }
3350         for( int i = 0; i < MAX_ARRS; i++ )
3351             u[i] = 0;
3352         haveTempDstUMats = false;
3353         haveTempSrcUMats = false;
3354     }
3355
3356     void cleanupUMats()
3357     {
3358         for( int i = 0; i < MAX_ARRS; i++ )
3359             if( u[i] )
3360             {
3361                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
3362                 {
3363                     u[i]->flags |= UMatData::ASYNC_CLEANUP;
3364                     u[i]->currAllocator->deallocate(u[i]);
3365                 }
3366                 u[i] = 0;
3367             }
3368         nu = 0;
3369         haveTempDstUMats = false;
3370         haveTempSrcUMats = false;
3371     }
3372
3373     void addUMat(const UMat& m, bool dst)
3374     {
3375         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
3376         u[nu] = m.u;
3377         CV_XADD(&m.u->urefcount, 1);
3378         nu++;
3379         if(dst && m.u->tempUMat())
3380             haveTempDstUMats = true;
3381         if(m.u->originalUMatData == NULL && m.u->tempUMat())
3382             haveTempSrcUMats = true;  // UMat is created on RAW memory (without proper lifetime management, even from Mat)
3383     }
3384
3385     /// Preserve image lifetime (while it is specified as Kernel argument)
3386     void registerImageArgument(int arg, const Image2D& image)
3387     {
3388         CV_CheckGE(arg, 0, "");
3389         CV_CheckLT(arg, (int)MAX_ARRS, "");
3390         if (arg < (int)shadow_images.size() && shadow_images[arg].ptr() != image.ptr())  // TODO future: replace ptr => impl (more strong check)
3391         {
3392             CV_Check(arg, !isInProgress, "ocl::Kernel: clearing of pending Image2D arguments is not allowed");
3393         }
3394         shadow_images.reserve(MAX_ARRS);
3395         shadow_images.resize(std::max(shadow_images.size(), (size_t)arg + 1));
3396         shadow_images[arg] = image;
3397     }
3398
3399     void finit(cl_event e)
3400     {
3401         CV_UNUSED(e);
3402         cleanupUMats();
3403         isInProgress = false;
3404         release();
3405     }
3406
3407     bool run(int dims, size_t _globalsize[], size_t _localsize[],
3408             bool sync, int64* timeNS, const Queue& q);
3409
3410     ~Impl()
3411     {
3412         if(handle)
3413         {
3414             CV_OCL_DBG_CHECK(clReleaseKernel(handle));
3415         }
3416     }
3417
3418     IMPLEMENT_REFCOUNTABLE();
3419
3420     cv::String name;
3421     cl_kernel handle;
3422     enum { MAX_ARRS = 16 };
3423     UMatData* u[MAX_ARRS];
3424     bool isInProgress;
3425     bool isAsyncRun;  // true if kernel was scheduled in async mode
3426     int nu;
3427     std::vector<Image2D> shadow_images;
3428     bool haveTempDstUMats;
3429     bool haveTempSrcUMats;
3430 };
3431
3432 }} // namespace cv::ocl
3433
3434 extern "C" {
3435
3436 static void CL_CALLBACK oclCleanupCallback(cl_event e, cl_int, void *p)
3437 {
3438     try
3439     {
3440         ((cv::ocl::Kernel::Impl*)p)->finit(e);
3441     }
3442     catch (const cv::Exception& exc)
3443     {
3444         CV_LOG_ERROR(NULL, "OCL: Unexpected OpenCV exception in OpenCL callback: " << exc.what());
3445     }
3446     catch (const std::exception& exc)
3447     {
3448         CV_LOG_ERROR(NULL, "OCL: Unexpected C++ exception in OpenCL callback: " << exc.what());
3449     }
3450     catch (...)
3451     {
3452         CV_LOG_ERROR(NULL, "OCL: Unexpected unknown C++ exception in OpenCL callback");
3453     }
3454 }
3455
3456 }
3457
3458 namespace cv { namespace ocl {
3459
3460 Kernel::Kernel()
3461 {
3462     p = 0;
3463 }
3464
3465 Kernel::Kernel(const char* kname, const Program& prog)
3466 {
3467     p = 0;
3468     create(kname, prog);
3469 }
3470
3471 Kernel::Kernel(const char* kname, const ProgramSource& src,
3472                const String& buildopts, String* errmsg)
3473 {
3474     p = 0;
3475     create(kname, src, buildopts, errmsg);
3476 }
3477
3478 Kernel::Kernel(const Kernel& k)
3479 {
3480     p = k.p;
3481     if(p)
3482         p->addref();
3483 }
3484
3485 Kernel& Kernel::operator = (const Kernel& k)
3486 {
3487     Impl* newp = (Impl*)k.p;
3488     if(newp)
3489         newp->addref();
3490     if(p)
3491         p->release();
3492     p = newp;
3493     return *this;
3494 }
3495
3496 Kernel::~Kernel()
3497 {
3498     if(p)
3499         p->release();
3500 }
3501
3502 bool Kernel::create(const char* kname, const Program& prog)
3503 {
3504     if(p)
3505         p->release();
3506     p = new Impl(kname, prog);
3507     if(p->handle == 0)
3508     {
3509         p->release();
3510         p = 0;
3511     }
3512 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
3513     CV_Assert(p);
3514 #endif
3515     return p != 0;
3516 }
3517
3518 bool Kernel::create(const char* kname, const ProgramSource& src,
3519                     const String& buildopts, String* errmsg)
3520 {
3521     if(p)
3522     {
3523         p->release();
3524         p = 0;
3525     }
3526     String tempmsg;
3527     if( !errmsg ) errmsg = &tempmsg;
3528     const Program prog = Context::getDefault().getProg(src, buildopts, *errmsg);
3529     return create(kname, prog);
3530 }
3531
3532 void* Kernel::ptr() const
3533 {
3534     return p ? p->handle : 0;
3535 }
3536
3537 bool Kernel::empty() const
3538 {
3539     return ptr() == 0;
3540 }
3541
3542 static cv::String dumpValue(size_t sz, const void* p)
3543 {
3544     if (sz == 4)
3545         return cv::format("%d / %uu / 0x%08x / %g", *(int*)p, *(int*)p, *(int*)p, *(float*)p);
3546     if (sz == 8)
3547         return cv::format("%lld / %lluu / 0x%16llx / %g", *(long long*)p, *(long long*)p, *(long long*)p, *(double*)p);
3548     return cv::format("%p", p);
3549 }
3550
3551 int Kernel::set(int i, const void* value, size_t sz)
3552 {
3553     if (!p || !p->handle)
3554         return -1;
3555     if (i < 0)
3556         return i;
3557     if( i == 0 )
3558         p->cleanupUMats();
3559
3560     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
3561     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, value=%s)", p->name.c_str(), (int)i, (int)sz, dumpValue(sz, value).c_str()).c_str());
3562     if (retval != CL_SUCCESS)
3563         return -1;
3564     return i+1;
3565 }
3566
3567 int Kernel::set(int i, const Image2D& image2D)
3568 {
3569     cl_mem h = (cl_mem)image2D.ptr();
3570     int res = set(i, &h, sizeof(h));
3571     if (res >= 0)
3572         p->registerImageArgument(i, image2D);
3573     return res;
3574 }
3575
3576 int Kernel::set(int i, const UMat& m)
3577 {
3578     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m));
3579 }
3580
3581 int Kernel::set(int i, const KernelArg& arg)
3582 {
3583     if( !p || !p->handle )
3584         return -1;
3585     if (i < 0)
3586     {
3587         CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d): negative arg_index",
3588                 p->name.c_str(), (int)i));
3589         return i;
3590     }
3591     if( i == 0 )
3592         p->cleanupUMats();
3593     cl_int status = 0;
3594     if( arg.m )
3595     {
3596         AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
3597                                  ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
3598         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
3599         if (ptronly && arg.m->empty())
3600         {
3601             cl_mem h_null = (cl_mem)NULL;
3602             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h_null), &h_null);
3603             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=NULL)", p->name.c_str(), (int)i).c_str());
3604             return i + 1;
3605         }
3606         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
3607
3608         if (!h)
3609         {
3610             CV_LOG_ERROR(NULL, cv::format("OpenCL: Kernel(%s)::set(arg_index=%d, flags=%d): can't create cl_mem handle for passed UMat buffer (addr=%p)",
3611                     p->name.c_str(), (int)i, (int)arg.flags, arg.m));
3612             p->release();
3613             p = 0;
3614             return -1;
3615         }
3616
3617 #ifdef HAVE_OPENCL_SVM
3618         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
3619         {
3620             const Context& ctx = Context::getDefault();
3621             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
3622             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
3623             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
3624 #if 1 // TODO
3625             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
3626 #else
3627             status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
3628 #endif
3629             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArgSVMPointer('%s', arg_index=%d, ptr=%p)", p->name.c_str(), (int)i, (void*)svmDataPtr).c_str());
3630         }
3631         else
3632 #endif
3633         {
3634             status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h);
3635             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cl_mem=%p)", p->name.c_str(), (int)i, (void*)h).c_str());
3636         }
3637
3638         if (ptronly)
3639         {
3640             i++;
3641         }
3642         else if( arg.m->dims <= 2 )
3643         {
3644             UMat2D u2d(*arg.m);
3645             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step);
3646             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+1), (int)u2d.step).c_str());
3647             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset);
3648             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+2), (int)u2d.offset).c_str());
3649             i += 3;
3650
3651             if( !(arg.flags & KernelArg::NO_SIZE) )
3652             {
3653                 int cols = u2d.cols*arg.wscale/arg.iwscale;
3654                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows);
3655                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)i, (int)u2d.rows).c_str());
3656                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols);
3657                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+1), (int)cols).c_str());
3658                 i += 2;
3659             }
3660         }
3661         else
3662         {
3663             UMat3D u3d(*arg.m);
3664             status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep);
3665             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slicestep_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.slicestep).c_str());
3666             status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step);
3667             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, step_value=%d)", p->name.c_str(), (int)(i+2), (int)u3d.step).c_str());
3668             status = clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset);
3669             CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, offset_value=%d)", p->name.c_str(), (int)(i+3), (int)u3d.offset).c_str());
3670             i += 4;
3671             if( !(arg.flags & KernelArg::NO_SIZE) )
3672             {
3673                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3674                 status = clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.slices);
3675                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, slices_value=%d)", p->name.c_str(), (int)i, (int)u3d.slices).c_str());
3676                 status = clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows);
3677                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, rows_value=%d)", p->name.c_str(), (int)(i+1), (int)u3d.rows).c_str());
3678                 status = clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols);
3679                 CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, cols_value=%d)", p->name.c_str(), (int)(i+2), (int)cols).c_str());
3680                 i += 3;
3681             }
3682         }
3683         p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
3684         return i;
3685     }
3686     status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
3687     CV_OCL_DBG_CHECK_RESULT(status, cv::format("clSetKernelArg('%s', arg_index=%d, size=%d, obj=%p)", p->name.c_str(), (int)i, (int)arg.sz, (void*)arg.obj).c_str());
3688     return i+1;
3689 }
3690
3691 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3692                  bool sync, const Queue& q)
3693 {
3694     if (!p)
3695         return false;
3696
3697     size_t globalsize[CV_MAX_DIM] = {1,1,1};
3698     size_t total = 1;
3699     CV_Assert(_globalsize != NULL);
3700     for (int i = 0; i < dims; i++)
3701     {
3702         size_t val = _localsize ? _localsize[i] :
3703             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3704         CV_Assert( val > 0 );
3705         total *= _globalsize[i];
3706         if (_globalsize[i] == 1 && !_localsize)
3707             val = 1;
3708         globalsize[i] = divUp(_globalsize[i], (unsigned int)val) * val;
3709     }
3710     CV_Assert(total > 0);
3711
3712     return p->run(dims, globalsize, _localsize, sync, NULL, q);
3713 }
3714
3715
3716 static bool isRaiseErrorOnReuseAsyncKernel()
3717 {
3718     static bool initialized = false;
3719     static bool value = false;
3720     if (!initialized)
3721     {
3722         value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_RAISE_ERROR_REUSE_ASYNC_KERNEL", false);
3723         initialized = true;
3724     }
3725     return value;
3726 }
3727
3728 bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
3729         bool sync, int64* timeNS, const Queue& q)
3730 {
3731     CV_INSTRUMENT_REGION_OPENCL_RUN(name.c_str());
3732
3733     if (!handle)
3734     {
3735         CV_LOG_ERROR(NULL, "OpenCL kernel has zero handle: " << name);
3736         return false;
3737     }
3738
3739     if (isAsyncRun)
3740     {
3741         CV_LOG_ERROR(NULL, "OpenCL kernel can't be reused in async mode: " << name);
3742         if (isRaiseErrorOnReuseAsyncKernel())
3743             CV_Assert(0);
3744         return false;  // OpenCV 5.0: raise error
3745     }
3746     isAsyncRun = !sync;
3747
3748     if (isInProgress)
3749     {
3750         CV_LOG_ERROR(NULL, "Previous OpenCL kernel launch is not finished: " << name);
3751         if (isRaiseErrorOnReuseAsyncKernel())
3752             CV_Assert(0);
3753         return false;  // OpenCV 5.0: raise error
3754     }
3755
3756     cl_command_queue qq = getQueue(q);
3757     if (haveTempDstUMats)
3758         sync = true;
3759     if (haveTempSrcUMats)
3760         sync = true;
3761     if (timeNS)
3762         sync = true;
3763     cl_event asyncEvent = 0;
3764     cl_int retval = clEnqueueNDRangeKernel(qq, handle, (cl_uint)dims,
3765                                            NULL, globalsize, localsize, 0, 0,
3766                                            (sync && !timeNS) ? 0 : &asyncEvent);
3767 #if !CV_OPENCL_SHOW_RUN_KERNELS
3768     if (retval != CL_SUCCESS)
3769 #endif
3770     {
3771         cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
3772                         globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
3773                         (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
3774                         sync ? "true" : "false"
3775                         );
3776         if (retval != CL_SUCCESS)
3777         {
3778             msg = CV_OCL_API_ERROR_MSG(retval, msg.c_str());
3779         }
3780 #if CV_OPENCL_TRACE_CHECK
3781         CV_OCL_TRACE_CHECK_RESULT(retval, msg.c_str());
3782 #else
3783         printf("%s\n", msg.c_str());
3784         fflush(stdout);
3785 #endif
3786     }
3787     if (sync || retval != CL_SUCCESS)
3788     {
3789         CV_OCL_DBG_CHECK(clFinish(qq));
3790         if (timeNS)
3791         {
3792             if (retval == CL_SUCCESS)
3793             {
3794                 CV_OCL_DBG_CHECK(clWaitForEvents(1, &asyncEvent));
3795                 cl_ulong startTime, stopTime;
3796                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, NULL));
3797                 CV_OCL_CHECK(clGetEventProfilingInfo(asyncEvent, CL_PROFILING_COMMAND_END, sizeof(stopTime), &stopTime, NULL));
3798                 *timeNS = (int64)(stopTime - startTime);
3799             }
3800             else
3801             {
3802                 *timeNS = -1;
3803             }
3804         }
3805         cleanupUMats();
3806     }
3807     else
3808     {
3809         addref();
3810         isInProgress = true;
3811         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, this));
3812     }
3813     if (asyncEvent)
3814         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3815     return retval == CL_SUCCESS;
3816 }
3817
3818 bool Kernel::runTask(bool sync, const Queue& q)
3819 {
3820     if(!p || !p->handle || p->isInProgress)
3821         return false;
3822
3823     cl_command_queue qq = getQueue(q);
3824     cl_event asyncEvent = 0;
3825     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &asyncEvent);
3826     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueTask('%s') sync=%s", p->name.c_str(), sync ? "true" : "false").c_str());
3827     if (sync || retval != CL_SUCCESS)
3828     {
3829         CV_OCL_DBG_CHECK(clFinish(qq));
3830         p->cleanupUMats();
3831     }
3832     else
3833     {
3834         p->addref();
3835         p->isInProgress = true;
3836         CV_OCL_CHECK(clSetEventCallback(asyncEvent, CL_COMPLETE, oclCleanupCallback, p));
3837     }
3838     if (asyncEvent)
3839         CV_OCL_DBG_CHECK(clReleaseEvent(asyncEvent));
3840     return retval == CL_SUCCESS;
3841 }
3842
3843 int64 Kernel::runProfiling(int dims, size_t globalsize[], size_t localsize[], const Queue& q_)
3844 {
3845     CV_Assert(p && p->handle && !p->isInProgress);
3846     Queue q = q_.ptr() ? q_ : Queue::getDefault();
3847     CV_Assert(q.ptr());
3848     q.finish(); // call clFinish() on base queue
3849     Queue profilingQueue = q.getProfilingQueue();
3850     int64 timeNs = -1;
3851     bool res = p->run(dims, globalsize, localsize, true, &timeNs, profilingQueue);
3852     return res ? timeNs : -1;
3853 }
3854
3855 size_t Kernel::workGroupSize() const
3856 {
3857     if(!p || !p->handle)
3858         return 0;
3859     size_t val = 0, retsz = 0;
3860     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3861     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, sizeof(val), &val, &retsz);
3862     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)");
3863     return status == CL_SUCCESS ? val : 0;
3864 }
3865
3866 size_t Kernel::preferedWorkGroupSizeMultiple() const
3867 {
3868     if(!p || !p->handle)
3869         return 0;
3870     size_t val = 0, retsz = 0;
3871     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3872     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(val), &val, &retsz);
3873     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE)");
3874     return status == CL_SUCCESS ? val : 0;
3875 }
3876
3877 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3878 {
3879     if(!p || !p->handle || !wsz)
3880         return 0;
3881     size_t retsz = 0;
3882     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3883     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, sizeof(wsz[0])*3, wsz, &retsz);
3884     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_COMPILE_WORK_GROUP_SIZE)");
3885     return status == CL_SUCCESS;
3886 }
3887
3888 size_t Kernel::localMemSize() const
3889 {
3890     if(!p || !p->handle)
3891         return 0;
3892     size_t retsz = 0;
3893     cl_ulong val = 0;
3894     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3895     cl_int status = clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(val), &val, &retsz);
3896     CV_OCL_CHECK_RESULT(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)");
3897     return status == CL_SUCCESS ? (size_t)val : 0;
3898 }
3899
3900
3901
3902 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3903
3904 struct ProgramSource::Impl
3905 {
3906     IMPLEMENT_REFCOUNTABLE();
3907
3908     enum KIND {
3909         PROGRAM_SOURCE_CODE = 0,
3910         PROGRAM_BINARIES,
3911         PROGRAM_SPIR,
3912         PROGRAM_SPIRV
3913     } kind_;
3914
3915     Impl(const String& src)
3916     {
3917         init(PROGRAM_SOURCE_CODE, cv::String(), cv::String());
3918         initFromSource(src, cv::String());
3919     }
3920     Impl(const String& module, const String& name, const String& codeStr, const String& codeHash)
3921     {
3922         init(PROGRAM_SOURCE_CODE, module, name);
3923         initFromSource(codeStr, codeHash);
3924     }
3925
3926     /// reset fields
3927     void init(enum KIND kind, const String& module, const String& name)
3928     {
3929         refcount = 1;
3930         kind_ = kind;
3931         module_ = module;
3932         name_ = name;
3933
3934         sourceAddr_ = NULL;
3935         sourceSize_ = 0;
3936         isHashUpdated = false;
3937     }
3938
3939     void initFromSource(const String& codeStr, const String& codeHash)
3940     {
3941         codeStr_ = codeStr;
3942         sourceHash_ = codeHash;
3943         if (sourceHash_.empty())
3944         {
3945             updateHash();
3946         }
3947         else
3948         {
3949             isHashUpdated = true;
3950         }
3951     }
3952
3953     void updateHash(const char* hashStr = NULL)
3954     {
3955         if (hashStr)
3956         {
3957             sourceHash_ = cv::String(hashStr);
3958             isHashUpdated = true;
3959             return;
3960         }
3961         uint64 hash = 0;
3962         switch (kind_)
3963         {
3964         case PROGRAM_SOURCE_CODE:
3965             if (sourceAddr_)
3966             {
3967                 CV_Assert(codeStr_.empty());
3968                 hash = crc64(sourceAddr_, sourceSize_); // static storage
3969             }
3970             else
3971             {
3972                 CV_Assert(!codeStr_.empty());
3973                 hash = crc64((uchar*)codeStr_.c_str(), codeStr_.size());
3974             }
3975             break;
3976         case PROGRAM_BINARIES:
3977         case PROGRAM_SPIR:
3978         case PROGRAM_SPIRV:
3979             hash = crc64(sourceAddr_, sourceSize_);
3980             break;
3981         default:
3982             CV_Error(Error::StsInternal, "Internal error");
3983         }
3984         sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
3985         isHashUpdated = true;
3986     }
3987
3988     Impl(enum KIND kind,
3989             const String& module, const String& name,
3990             const unsigned char* binary, const size_t size,
3991             const cv::String& buildOptions = cv::String())
3992     {
3993         init(kind, module, name);
3994
3995         sourceAddr_ = binary;
3996         sourceSize_ = size;
3997
3998         buildOptions_ = buildOptions;
3999     }
4000
4001     static ProgramSource fromSourceWithStaticLifetime(const String& module, const String& name,
4002             const char* sourceCodeStaticStr, const char* hashStaticStr,
4003             const cv::String& buildOptions)
4004     {
4005         ProgramSource result;
4006         result.p = new Impl(PROGRAM_SOURCE_CODE, module, name,
4007                 (const unsigned char*)sourceCodeStaticStr, strlen(sourceCodeStaticStr), buildOptions);
4008         result.p->updateHash(hashStaticStr);
4009         return result;
4010     }
4011
4012     static ProgramSource fromBinary(const String& module, const String& name,
4013             const unsigned char* binary, const size_t size,
4014             const cv::String& buildOptions)
4015     {
4016         ProgramSource result;
4017         result.p = new Impl(PROGRAM_BINARIES, module, name, binary, size, buildOptions);
4018         return result;
4019     }
4020
4021     static ProgramSource fromSPIR(const String& module, const String& name,
4022             const unsigned char* binary, const size_t size,
4023             const cv::String& buildOptions)
4024     {
4025         ProgramSource result;
4026         result.p = new Impl(PROGRAM_SPIR, module, name, binary, size, buildOptions);
4027         return result;
4028     }
4029
4030     String module_;
4031     String name_;
4032
4033     // TODO std::vector<ProgramSource> includes_;
4034     String codeStr_; // PROGRAM_SOURCE_CODE only
4035
4036     const unsigned char* sourceAddr_;
4037     size_t sourceSize_;
4038
4039     cv::String buildOptions_;
4040
4041     String sourceHash_;
4042     bool isHashUpdated;
4043
4044     friend struct Program::Impl;
4045     friend struct internal::ProgramEntry;
4046     friend struct Context::Impl;
4047 };
4048
4049
4050 ProgramSource::ProgramSource()
4051 {
4052     p = 0;
4053 }
4054
4055 ProgramSource::ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash)
4056 {
4057     p = new Impl(module, name, codeStr, codeHash);
4058 }
4059
4060 ProgramSource::ProgramSource(const char* prog)
4061 {
4062     p = new Impl(prog);
4063 }
4064
4065 ProgramSource::ProgramSource(const String& prog)
4066 {
4067     p = new Impl(prog);
4068 }
4069
4070 ProgramSource::~ProgramSource()
4071 {
4072     if(p)
4073         p->release();
4074 }
4075
4076 ProgramSource::ProgramSource(const ProgramSource& prog)
4077 {
4078     p = prog.p;
4079     if(p)
4080         p->addref();
4081 }
4082
4083 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
4084 {
4085     Impl* newp = (Impl*)prog.p;
4086     if(newp)
4087         newp->addref();
4088     if(p)
4089         p->release();
4090     p = newp;
4091     return *this;
4092 }
4093
4094 const String& ProgramSource::source() const
4095 {
4096     CV_Assert(p);
4097     CV_Assert(p->kind_ == Impl::PROGRAM_SOURCE_CODE);
4098     CV_Assert(p->sourceAddr_ == NULL); // method returns reference - can't construct temporary object
4099     return p->codeStr_;
4100 }
4101
4102 ProgramSource::hash_t ProgramSource::hash() const
4103 {
4104     CV_Error(Error::StsNotImplemented, "Removed method: ProgramSource::hash()");
4105 }
4106
4107 ProgramSource ProgramSource::fromBinary(const String& module, const String& name,
4108         const unsigned char* binary, const size_t size,
4109         const cv::String& buildOptions)
4110 {
4111     CV_Assert(binary);
4112     CV_Assert(size > 0);
4113     return Impl::fromBinary(module, name, binary, size, buildOptions);
4114 }
4115
4116 ProgramSource ProgramSource::fromSPIR(const String& module, const String& name,
4117         const unsigned char* binary, const size_t size,
4118         const cv::String& buildOptions)
4119 {
4120     CV_Assert(binary);
4121     CV_Assert(size > 0);
4122     return Impl::fromBinary(module, name, binary, size, buildOptions);
4123 }
4124
4125
4126 internal::ProgramEntry::operator ProgramSource&() const
4127 {
4128     if (this->pProgramSource == NULL)
4129     {
4130         cv::AutoLock lock(cv::getInitializationMutex());
4131         if (this->pProgramSource == NULL)
4132         {
4133             ProgramSource ps = ProgramSource::Impl::fromSourceWithStaticLifetime(this->module, this->name, this->programCode, this->programHash, cv::String());
4134             ProgramSource* ptr = new ProgramSource(ps);
4135             const_cast<ProgramEntry*>(this)->pProgramSource = ptr;
4136         }
4137     }
4138     return *this->pProgramSource;
4139 }
4140
4141
4142
4143 /////////////////////////////////////////// Program /////////////////////////////////////////////
4144
4145 static
4146 cv::String joinBuildOptions(const cv::String& a, const cv::String& b)
4147 {
4148     if (b.empty())
4149         return a;
4150     if (a.empty())
4151         return b;
4152     if (b[0] == ' ')
4153         return a + b;
4154     return a + (cv::String(" ") + b);
4155 }
4156
4157 struct Program::Impl
4158 {
4159     IMPLEMENT_REFCOUNTABLE();
4160
4161     Impl(const ProgramSource& src,
4162          const String& _buildflags, String& errmsg) :
4163          refcount(1),
4164          handle(NULL),
4165          buildflags(_buildflags)
4166     {
4167         const ProgramSource::Impl* src_ = src.getImpl();
4168         CV_Assert(src_);
4169         sourceModule_ = src_->module_;
4170         sourceName_ = src_->name_;
4171         const Context ctx = Context::getDefault();
4172         Device device = ctx.device(0);
4173         if (ctx.ptr() == NULL || device.ptr() == NULL)
4174             return;
4175         buildflags = joinBuildOptions(buildflags, src_->buildOptions_);
4176         if (src.getImpl()->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4177         {
4178             if (device.isAMD())
4179                 buildflags = joinBuildOptions(buildflags, " -D AMD_DEVICE");
4180             else if (device.isIntel())
4181                 buildflags = joinBuildOptions(buildflags, " -D INTEL_DEVICE");
4182             const String param_buildExtraOptions = getBuildExtraOptions();
4183             if (!param_buildExtraOptions.empty())
4184                 buildflags = joinBuildOptions(buildflags, param_buildExtraOptions);
4185         }
4186         compile(ctx, src_, errmsg);
4187     }
4188
4189     bool compile(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4190     {
4191         CV_Assert(ctx.getImpl());
4192         CV_Assert(src_);
4193
4194         // We don't cache OpenCL binaries
4195         if (src_->kind_ == ProgramSource::Impl::PROGRAM_BINARIES)
4196         {
4197             CV_LOG_VERBOSE(NULL, 0, "Load program binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4198             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4199             return isLoaded;
4200         }
4201         return compileWithCache(ctx, src_, errmsg);
4202     }
4203
4204     bool compileWithCache(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4205     {
4206         CV_Assert(ctx.getImpl());
4207         CV_Assert(src_);
4208         CV_Assert(src_->kind_ != ProgramSource::Impl::PROGRAM_BINARIES);
4209
4210 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4211         OpenCLBinaryCacheConfigurator& config = OpenCLBinaryCacheConfigurator::getSingletonInstance();
4212         const std::string base_dir = config.prepareCacheDirectoryForContext(
4213                 ctx.getImpl()->getPrefixString(),
4214                 ctx.getImpl()->getPrefixBase()
4215         );
4216         const String& hash_str = src_->sourceHash_;
4217         cv::String fname;
4218         if (!base_dir.empty() && !src_->module_.empty() && !src_->name_.empty())
4219         {
4220             CV_Assert(!hash_str.empty());
4221             fname = src_->module_ + "--" + src_->name_ + "_" + hash_str + ".bin";
4222             fname = utils::fs::join(base_dir, fname);
4223         }
4224         const cv::Ptr<utils::fs::FileLock> fileLock = config.cache_lock_; // can be empty
4225         if (!fname.empty() && CV_OPENCL_CACHE_ENABLE)
4226         {
4227             try
4228             {
4229                 std::vector<char> binaryBuf;
4230                 bool res = false;
4231                 {
4232                     cv::utils::optional_shared_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4233                     BinaryProgramFile file(fname, hash_str.c_str());
4234                     res = file.read(buildflags, binaryBuf);
4235                 }
4236                 if (res)
4237                 {
4238                     CV_Assert(!binaryBuf.empty());
4239                     CV_LOG_VERBOSE(NULL, 0, "Load program binary from cache: " << src_->module_.c_str() << "/" << src_->name_.c_str());
4240                     bool isLoaded = createFromBinary(ctx, binaryBuf, errmsg);
4241                     if (isLoaded)
4242                         return true;
4243                 }
4244             }
4245             catch (const cv::Exception& e)
4246             {
4247                 CV_UNUSED(e);
4248                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname << std::endl << e.what());
4249             }
4250             catch (...)
4251             {
4252                 CV_LOG_VERBOSE(NULL, 0, "Can't load OpenCL binary: " + fname);
4253             }
4254         }
4255 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4256         CV_Assert(handle == NULL);
4257         if (src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE)
4258         {
4259             if (!buildFromSources(ctx, src_, errmsg))
4260             {
4261                 return false;
4262             }
4263         }
4264         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIR)
4265         {
4266             buildflags = joinBuildOptions(buildflags, " -x spir");
4267             if ((cv::String(" ") + buildflags).find(" -spir-std=") == cv::String::npos)
4268             {
4269                 buildflags = joinBuildOptions(buildflags, " -spir-std=1.2");
4270             }
4271             CV_LOG_VERBOSE(NULL, 0, "Load program SPIR binary... " << src_->module_.c_str() << "/" << src_->name_.c_str());
4272             bool isLoaded = createFromBinary(ctx, src_->sourceAddr_, src_->sourceSize_, errmsg);
4273             if (!isLoaded)
4274                 return false;
4275         }
4276         else if (src_->kind_ == ProgramSource::Impl::PROGRAM_SPIRV)
4277         {
4278             CV_Error(Error::StsNotImplemented, "OpenCL: SPIR-V is not supported");
4279         }
4280         else
4281         {
4282             CV_Error(Error::StsInternal, "Internal error");
4283         }
4284         CV_Assert(handle != NULL);
4285 #if OPENCV_HAVE_FILESYSTEM_SUPPORT
4286         if (!fname.empty() && CV_OPENCL_CACHE_WRITE)
4287         {
4288             try
4289             {
4290                 std::vector<char> binaryBuf;
4291                 getProgramBinary(binaryBuf);
4292                 {
4293                     cv::utils::optional_lock_guard<cv::utils::fs::FileLock> lock_fs(fileLock.get());
4294                     BinaryProgramFile file(fname, hash_str.c_str());
4295                     file.write(buildflags, binaryBuf);
4296                 }
4297             }
4298             catch (const cv::Exception& e)
4299             {
4300                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname << std::endl << e.what());
4301             }
4302             catch (...)
4303             {
4304                 CV_LOG_WARNING(NULL, "Can't save OpenCL binary into cache: " + fname);
4305             }
4306         }
4307 #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
4308 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4309         if (CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4310         {
4311             std::vector<char> binaryBuf;
4312             getProgramBinary(binaryBuf);
4313             if (!binaryBuf.empty())
4314             {
4315                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4316                 handle = NULL;
4317                 createFromBinary(ctx, binaryBuf, errmsg);
4318             }
4319         }
4320 #endif
4321         return handle != NULL;
4322     }
4323
4324     void dumpBuildLog_(cl_int result, const cl_device_id* deviceList, String& errmsg)
4325     {
4326         AutoBuffer<char, 4096> buffer; buffer[0] = 0;
4327
4328         size_t retsz = 0;
4329         cl_int log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4330                                                   CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
4331         if (log_retval == CL_SUCCESS && retsz > 1)
4332         {
4333             buffer.resize(retsz + 16);
4334             log_retval = clGetProgramBuildInfo(handle, deviceList[0],
4335                                                CL_PROGRAM_BUILD_LOG, retsz+1, buffer.data(), &retsz);
4336             if (log_retval == CL_SUCCESS)
4337             {
4338                 if (retsz < buffer.size())
4339                     buffer[retsz] = 0;
4340                 else
4341                     buffer[buffer.size() - 1] = 0;
4342             }
4343             else
4344             {
4345                 buffer[0] = 0;
4346             }
4347         }
4348
4349         errmsg = String(buffer.data());
4350         printf("OpenCL program build log: %s/%s\nStatus %d: %s\n%s\n%s\n",
4351                 sourceModule_.c_str(), sourceName_.c_str(),
4352                 result, getOpenCLErrorString(result),
4353                 buildflags.c_str(), errmsg.c_str());
4354         fflush(stdout);
4355     }
4356
4357     bool buildFromSources(const Context& ctx, const ProgramSource::Impl* src_, String& errmsg)
4358     {
4359         CV_Assert(src_);
4360         CV_Assert(src_->kind_ == ProgramSource::Impl::PROGRAM_SOURCE_CODE);
4361         CV_Assert(handle == NULL);
4362         CV_INSTRUMENT_REGION_OPENCL_COMPILE(cv::format("Build OpenCL program: %s/%s %s options: %s",
4363                 sourceModule_.c_str(), sourceName_.c_str(),
4364                 src_->sourceHash_.c_str(), buildflags.c_str()).c_str());
4365
4366         CV_LOG_VERBOSE(NULL, 0, "Compile... " << sourceModule_.c_str() << "/" << sourceName_.c_str());
4367
4368         const char* srcptr = src_->sourceAddr_ ? ((const char*)src_->sourceAddr_) : src_->codeStr_.c_str();
4369         size_t srclen = src_->sourceAddr_ ? src_->sourceSize_ : src_->codeStr_.size();
4370         CV_Assert(srcptr != NULL);
4371         CV_Assert(srclen > 0);
4372
4373         cl_int retval = 0;
4374
4375         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
4376         CV_OCL_DBG_CHECK_RESULT(retval, "clCreateProgramWithSource");
4377         CV_Assert(handle || retval != CL_SUCCESS);
4378         if (handle && retval == CL_SUCCESS)
4379         {
4380             size_t n = ctx.ndevices();
4381             AutoBuffer<cl_device_id, 4> deviceListBuf(n + 1);
4382             cl_device_id* deviceList = deviceListBuf.data();
4383             for (size_t i = 0; i < n; i++)
4384             {
4385                 deviceList[i] = (cl_device_id)(ctx.device(i).ptr());
4386             }
4387
4388             retval = clBuildProgram(handle, (cl_uint)n, deviceList, buildflags.c_str(), 0, 0);
4389             CV_OCL_TRACE_CHECK_RESULT(/*don't throw: retval*/CL_SUCCESS, cv::format("clBuildProgram(source: %s)", buildflags.c_str()).c_str());
4390 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
4391             if (retval != CL_SUCCESS)
4392 #endif
4393             {
4394                 dumpBuildLog_(retval, deviceList, errmsg);
4395
4396                 // don't remove "retval != CL_SUCCESS" condition here:
4397                 // it would break CV_OPENCL_ALWAYS_SHOW_BUILD_LOG mode
4398                 if (retval != CL_SUCCESS && handle)
4399                 {
4400                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4401                     handle = NULL;
4402                 }
4403             }
4404 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4405             if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4406             {
4407                 CV_LOG_INFO(NULL, "OpenCL: query kernel names (build from sources)...");
4408                 size_t retsz = 0;
4409                 char kernels_buffer[4096] = {0};
4410                 cl_int result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4411                 if (retsz < sizeof(kernels_buffer))
4412                     kernels_buffer[retsz] = 0;
4413                 else
4414                     kernels_buffer[0] = 0;
4415                 CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4416             }
4417 #endif
4418
4419         }
4420         return handle != NULL;
4421     }
4422
4423     void getProgramBinary(std::vector<char>& buf)
4424     {
4425         CV_Assert(handle);
4426         size_t sz = 0;
4427         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(sz), &sz, NULL));
4428         buf.resize(sz);
4429         uchar* ptr = (uchar*)&buf[0];
4430         CV_OCL_CHECK(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(ptr), &ptr, NULL));
4431     }
4432
4433     bool createFromBinary(const Context& ctx, const std::vector<char>& buf, String& errmsg)
4434     {
4435         return createFromBinary(ctx, (const unsigned char*)&buf[0], buf.size(), errmsg);
4436     }
4437
4438     bool createFromBinary(const Context& ctx, const unsigned char* binaryAddr, const size_t binarySize, String& errmsg)
4439     {
4440         CV_Assert(handle == NULL);
4441         CV_INSTRUMENT_REGION_OPENCL_COMPILE("Load OpenCL program");
4442         CV_LOG_VERBOSE(NULL, 0, "Load from binary... (" << binarySize << " bytes)");
4443
4444         CV_Assert(binarySize > 0);
4445
4446         size_t ndevices = (int)ctx.ndevices();
4447         AutoBuffer<cl_device_id> devices_(ndevices);
4448         AutoBuffer<const uchar*> binaryPtrs_(ndevices);
4449         AutoBuffer<size_t> binarySizes_(ndevices);
4450
4451         cl_device_id* devices = devices_.data();
4452         const uchar** binaryPtrs = binaryPtrs_.data();
4453         size_t* binarySizes = binarySizes_.data();
4454         for (size_t i = 0; i < ndevices; i++)
4455         {
4456             devices[i] = (cl_device_id)ctx.device(i).ptr();
4457             binaryPtrs[i] = binaryAddr;
4458             binarySizes[i] = binarySize;
4459         }
4460
4461         cl_int result = 0;
4462         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), (cl_uint)ndevices, devices_.data(),
4463                                            binarySizes, binaryPtrs, NULL, &result);
4464         if (result != CL_SUCCESS)
4465         {
4466             CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clCreateProgramWithBinary"));
4467             if (handle)
4468             {
4469                 CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4470                 handle = NULL;
4471             }
4472         }
4473         if (!handle)
4474         {
4475             return false;
4476         }
4477         // call clBuildProgram()
4478         {
4479             result = clBuildProgram(handle, (cl_uint)ndevices, devices_.data(), buildflags.c_str(), 0, 0);
4480             CV_OCL_DBG_CHECK_RESULT(result, cv::format("clBuildProgram(binary: %s/%s)", sourceModule_.c_str(), sourceName_.c_str()).c_str());
4481             if (result != CL_SUCCESS)
4482             {
4483                 dumpBuildLog_(result, devices, errmsg);
4484                 if (handle)
4485                 {
4486                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4487                     handle = NULL;
4488                 }
4489                 return false;
4490             }
4491         }
4492         // check build status
4493         {
4494             cl_build_status build_status = CL_BUILD_NONE;
4495             size_t retsz = 0;
4496             CV_OCL_DBG_CHECK(result = clGetProgramBuildInfo(handle, devices[0], CL_PROGRAM_BUILD_STATUS,
4497                     sizeof(build_status), &build_status, &retsz));
4498             if (result == CL_SUCCESS)
4499             {
4500                 if (build_status == CL_BUILD_SUCCESS)
4501                 {
4502                     return true;
4503                 }
4504                 else
4505                 {
4506                     CV_LOG_WARNING(NULL, "clGetProgramBuildInfo() returns " << build_status);
4507                     return false;
4508                 }
4509             }
4510             else
4511             {
4512                 CV_LOG_ERROR(NULL, CV_OCL_API_ERROR_MSG(result, "clGetProgramBuildInfo()"));
4513                 if (handle)
4514                 {
4515                     CV_OCL_DBG_CHECK(clReleaseProgram(handle));
4516                     handle = NULL;
4517                 }
4518             }
4519         }
4520 #if CV_OPENCL_VALIDATE_BINARY_PROGRAMS
4521         if (handle && CV_OPENCL_VALIDATE_BINARY_PROGRAMS_VALUE)
4522         {
4523             CV_LOG_INFO(NULL, "OpenCL: query kernel names (binary)...");
4524             size_t retsz = 0;
4525             char kernels_buffer[4096] = {0};
4526             result = clGetProgramInfo(handle, CL_PROGRAM_KERNEL_NAMES, sizeof(kernels_buffer), &kernels_buffer[0], &retsz);
4527             if (retsz < sizeof(kernels_buffer))
4528                 kernels_buffer[retsz] = 0;
4529             else
4530                 kernels_buffer[0] = 0;
4531             CV_LOG_INFO(NULL, result << ": Kernels='" << kernels_buffer << "'");
4532         }
4533 #endif
4534         return handle != NULL;
4535     }
4536
4537     ~Impl()
4538     {
4539         if( handle )
4540         {
4541 #ifdef _WIN32
4542             if (!cv::__termination)
4543 #endif
4544             {
4545                 clReleaseProgram(handle);
4546             }
4547             handle = NULL;
4548         }
4549     }
4550
4551     cl_program handle;
4552
4553     String buildflags;
4554     String sourceModule_;
4555     String sourceName_;
4556 };
4557
4558
4559 Program::Program() { p = 0; }
4560
4561 Program::Program(const ProgramSource& src,
4562         const String& buildflags, String& errmsg)
4563 {
4564     p = 0;
4565     create(src, buildflags, errmsg);
4566 }
4567
4568 Program::Program(const Program& prog)
4569 {
4570     p = prog.p;
4571     if(p)
4572         p->addref();
4573 }
4574
4575 Program& Program::operator = (const Program& prog)
4576 {
4577     Impl* newp = (Impl*)prog.p;
4578     if(newp)
4579         newp->addref();
4580     if(p)
4581         p->release();
4582     p = newp;
4583     return *this;
4584 }
4585
4586 Program::~Program()
4587 {
4588     if(p)
4589         p->release();
4590 }
4591
4592 bool Program::create(const ProgramSource& src,
4593             const String& buildflags, String& errmsg)
4594 {
4595     if(p)
4596     {
4597         p->release();
4598         p = NULL;
4599     }
4600     p = new Impl(src, buildflags, errmsg);
4601     if(!p->handle)
4602     {
4603         p->release();
4604         p = 0;
4605     }
4606     return p != 0;
4607 }
4608
4609 void* Program::ptr() const
4610 {
4611     return p ? p->handle : 0;
4612 }
4613
4614 #ifndef OPENCV_REMOVE_DEPRECATED_API
4615 const ProgramSource& Program::source() const
4616 {
4617     CV_Error(Error::StsNotImplemented, "Removed API");
4618 }
4619
4620 bool Program::read(const String& bin, const String& buildflags)
4621 {
4622     CV_UNUSED(bin); CV_UNUSED(buildflags);
4623     CV_Error(Error::StsNotImplemented, "Removed API");
4624 }
4625
4626 bool Program::write(String& bin) const
4627 {
4628     CV_UNUSED(bin);
4629     CV_Error(Error::StsNotImplemented, "Removed API");
4630 }
4631
4632 String Program::getPrefix() const
4633 {
4634     if(!p)
4635         return String();
4636     Context::Impl* ctx_ = Context::getDefault().getImpl();
4637     CV_Assert(ctx_);
4638     return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), p->buildflags.c_str());
4639 }
4640
4641 String Program::getPrefix(const String& buildflags)
4642 {
4643         Context::Impl* ctx_ = Context::getDefault().getImpl();
4644         CV_Assert(ctx_);
4645         return cv::format("opencl=%s\nbuildflags=%s", ctx_->getPrefixString().c_str(), buildflags.c_str());
4646 }
4647 #endif // OPENCV_REMOVE_DEPRECATED_API
4648
4649 void Program::getBinary(std::vector<char>& binary) const
4650 {
4651     CV_Assert(p && "Empty program");
4652     p->getProgramBinary(binary);
4653 }
4654
4655 Program Context::Impl::getProg(const ProgramSource& src,
4656                                const String& buildflags, String& errmsg)
4657 {
4658     size_t limit = getProgramCountLimit();
4659     const ProgramSource::Impl* src_ = src.getImpl();
4660     CV_Assert(src_);
4661     String key = cv::format("module=%s name=%s codehash=%s\nopencl=%s\nbuildflags=%s",
4662             src_->module_.c_str(), src_->name_.c_str(), src_->sourceHash_.c_str(),
4663             getPrefixString().c_str(),
4664             buildflags.c_str());
4665     {
4666         cv::AutoLock lock(program_cache_mutex);
4667         phash_t::iterator it = phash.find(key);
4668         if (it != phash.end())
4669         {
4670             // TODO LRU cache
4671             CacheList::iterator i = std::find(cacheList.begin(), cacheList.end(), key);
4672             if (i != cacheList.end() && i != cacheList.begin())
4673             {
4674                 cacheList.erase(i);
4675                 cacheList.push_front(key);
4676             }
4677             return it->second;
4678         }
4679         { // cleanup program cache
4680             size_t sz = phash.size();
4681             if (limit > 0 && sz >= limit)
4682             {
4683                 static bool warningFlag = false;
4684                 if (!warningFlag)
4685                 {
4686                     printf("\nWARNING: OpenCV-OpenCL:\n"
4687                         "    In-memory cache for OpenCL programs is full, older programs will be unloaded.\n"
4688                         "    You can change cache size via OPENCV_OPENCL_PROGRAM_CACHE environment variable\n\n");
4689                     warningFlag = true;
4690                 }
4691                 while (!cacheList.empty())
4692                 {
4693                     size_t c = phash.erase(cacheList.back());
4694                     cacheList.pop_back();
4695                     if (c != 0)
4696                         break;
4697                 }
4698             }
4699         }
4700     }
4701     Program prog(src, buildflags, errmsg);
4702     // Cache result of build failures too (to prevent unnecessary compiler invocations)
4703     {
4704         cv::AutoLock lock(program_cache_mutex);
4705         phash.insert(std::pair<std::string, Program>(key, prog));
4706         cacheList.push_front(key);
4707     }
4708     return prog;
4709 }
4710
4711
4712 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
4713
4714 template<typename T>
4715 class OpenCLBufferPool
4716 {
4717 protected:
4718     ~OpenCLBufferPool() { }
4719 public:
4720     virtual T allocate(size_t size) = 0;
4721     virtual void release(T buffer) = 0;
4722 };
4723
4724 template <typename Derived, typename BufferEntry, typename T>
4725 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
4726 {
4727 private:
4728     inline Derived& derived() { return *static_cast<Derived*>(this); }
4729 protected:
4730     Mutex mutex_;
4731
4732     size_t currentReservedSize;
4733     size_t maxReservedSize;
4734
4735     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
4736     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
4737
4738     // synchronized
4739     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
4740     {
4741         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
4742         for (; i != allocatedEntries_.end(); ++i)
4743         {
4744             BufferEntry& e = *i;
4745             if (e.clBuffer_ == buffer)
4746             {
4747                 entry = e;
4748                 allocatedEntries_.erase(i);
4749                 return true;
4750             }
4751         }
4752         return false;
4753     }
4754
4755     // synchronized
4756     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
4757     {
4758         if (reservedEntries_.empty())
4759             return false;
4760         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4761         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
4762         BufferEntry result;
4763         size_t minDiff = (size_t)(-1);
4764         for (; i != reservedEntries_.end(); ++i)
4765         {
4766             BufferEntry& e = *i;
4767             if (e.capacity_ >= size)
4768             {
4769                 size_t diff = e.capacity_ - size;
4770                 if (diff < std::max((size_t)4096, size / 8) && (result_pos == reservedEntries_.end() || diff < minDiff))
4771                 {
4772                     minDiff = diff;
4773                     result_pos = i;
4774                     result = e;
4775                     if (diff == 0)
4776                         break;
4777                 }
4778             }
4779         }
4780         if (result_pos != reservedEntries_.end())
4781         {
4782             //CV_DbgAssert(result == *result_pos);
4783             reservedEntries_.erase(result_pos);
4784             entry = result;
4785             currentReservedSize -= entry.capacity_;
4786             allocatedEntries_.push_back(entry);
4787             return true;
4788         }
4789         return false;
4790     }
4791
4792     // synchronized
4793     void _checkSizeOfReservedEntries()
4794     {
4795         while (currentReservedSize > maxReservedSize)
4796         {
4797             CV_DbgAssert(!reservedEntries_.empty());
4798             const BufferEntry& entry = reservedEntries_.back();
4799             CV_DbgAssert(currentReservedSize >= entry.capacity_);
4800             currentReservedSize -= entry.capacity_;
4801             derived()._releaseBufferEntry(entry);
4802             reservedEntries_.pop_back();
4803         }
4804     }
4805
4806     inline size_t _allocationGranularity(size_t size)
4807     {
4808         // heuristic values
4809         if (size < 1024*1024)
4810             return 4096;  // don't work with buffers smaller than 4Kb (hidden allocation overhead issue)
4811         else if (size < 16*1024*1024)
4812             return 64*1024;
4813         else
4814             return 1024*1024;
4815     }
4816
4817 public:
4818     OpenCLBufferPoolBaseImpl()
4819         : currentReservedSize(0),
4820           maxReservedSize(0)
4821     {
4822         // nothing
4823     }
4824     virtual ~OpenCLBufferPoolBaseImpl()
4825     {
4826         freeAllReservedBuffers();
4827         CV_Assert(reservedEntries_.empty());
4828     }
4829 public:
4830     virtual T allocate(size_t size) CV_OVERRIDE
4831     {
4832         AutoLock locker(mutex_);
4833         BufferEntry entry;
4834         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
4835         {
4836             CV_DbgAssert(size <= entry.capacity_);
4837             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
4838         }
4839         else
4840         {
4841             derived()._allocateBufferEntry(entry, size);
4842         }
4843         return entry.clBuffer_;
4844     }
4845     virtual void release(T buffer) CV_OVERRIDE
4846     {
4847         AutoLock locker(mutex_);
4848         BufferEntry entry;
4849         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
4850         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
4851         {
4852             derived()._releaseBufferEntry(entry);
4853         }
4854         else
4855         {
4856             reservedEntries_.push_front(entry);
4857             currentReservedSize += entry.capacity_;
4858             _checkSizeOfReservedEntries();
4859         }
4860     }
4861
4862     virtual size_t getReservedSize() const CV_OVERRIDE { return currentReservedSize; }
4863     virtual size_t getMaxReservedSize() const CV_OVERRIDE { return maxReservedSize; }
4864     virtual void setMaxReservedSize(size_t size) CV_OVERRIDE
4865     {
4866         AutoLock locker(mutex_);
4867         size_t oldMaxReservedSize = maxReservedSize;
4868         maxReservedSize = size;
4869         if (maxReservedSize < oldMaxReservedSize)
4870         {
4871             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
4872             for (; i != reservedEntries_.end();)
4873             {
4874                 const BufferEntry& entry = *i;
4875                 if (entry.capacity_ > maxReservedSize / 8)
4876                 {
4877                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
4878                     currentReservedSize -= entry.capacity_;
4879                     derived()._releaseBufferEntry(entry);
4880                     i = reservedEntries_.erase(i);
4881                     continue;
4882                 }
4883                 ++i;
4884             }
4885             _checkSizeOfReservedEntries();
4886         }
4887     }
4888     virtual void freeAllReservedBuffers() CV_OVERRIDE
4889     {
4890         AutoLock locker(mutex_);
4891         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
4892         for (; i != reservedEntries_.end(); ++i)
4893         {
4894             const BufferEntry& entry = *i;
4895             derived()._releaseBufferEntry(entry);
4896         }
4897         reservedEntries_.clear();
4898         currentReservedSize = 0;
4899     }
4900 };
4901
4902 struct CLBufferEntry
4903 {
4904     cl_mem clBuffer_;
4905     size_t capacity_;
4906     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
4907 };
4908
4909 class OpenCLBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
4910 {
4911 public:
4912     typedef struct CLBufferEntry BufferEntry;
4913 protected:
4914     int createFlags_;
4915 public:
4916     OpenCLBufferPoolImpl(int createFlags = 0)
4917         : createFlags_(createFlags)
4918     {
4919     }
4920
4921     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4922     {
4923         CV_DbgAssert(entry.clBuffer_ == NULL);
4924         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4925         Context& ctx = Context::getDefault();
4926         cl_int retval = CL_SUCCESS;
4927         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
4928         CV_OCL_CHECK_RESULT(retval, cv::format("clCreateBuffer(capacity=%lld) => %p", (long long int)entry.capacity_, (void*)entry.clBuffer_).c_str());
4929         CV_Assert(entry.clBuffer_ != NULL);
4930         if(retval == CL_SUCCESS)
4931         {
4932             CV_IMPL_ADD(CV_IMPL_OCL);
4933         }
4934         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
4935                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4936         allocatedEntries_.push_back(entry);
4937     }
4938
4939     void _releaseBufferEntry(const BufferEntry& entry)
4940     {
4941         CV_Assert(entry.capacity_ != 0);
4942         CV_Assert(entry.clBuffer_ != NULL);
4943         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
4944                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4945         CV_OCL_DBG_CHECK(clReleaseMemObject(entry.clBuffer_));
4946     }
4947 };
4948
4949 #ifdef HAVE_OPENCL_SVM
4950 struct CLSVMBufferEntry
4951 {
4952     void* clBuffer_;
4953     size_t capacity_;
4954     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
4955 };
4956 class OpenCLSVMBufferPoolImpl CV_FINAL : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
4957 {
4958 public:
4959     typedef struct CLSVMBufferEntry BufferEntry;
4960 public:
4961     OpenCLSVMBufferPoolImpl()
4962     {
4963     }
4964
4965     void _allocateBufferEntry(BufferEntry& entry, size_t size)
4966     {
4967         CV_DbgAssert(entry.clBuffer_ == NULL);
4968         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
4969
4970         Context& ctx = Context::getDefault();
4971         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
4972         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
4973         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
4974                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
4975
4976         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
4977         CV_DbgAssert(svmFns->isValid());
4978
4979         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
4980         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
4981         CV_Assert(buf);
4982
4983         entry.clBuffer_ = buf;
4984         {
4985             CV_IMPL_ADD(CV_IMPL_OCL);
4986         }
4987         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
4988                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
4989         allocatedEntries_.push_back(entry);
4990     }
4991
4992     void _releaseBufferEntry(const BufferEntry& entry)
4993     {
4994         CV_Assert(entry.capacity_ != 0);
4995         CV_Assert(entry.clBuffer_ != NULL);
4996         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
4997                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
4998         Context& ctx = Context::getDefault();
4999         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5000         CV_DbgAssert(svmFns->isValid());
5001         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
5002         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
5003     }
5004 };
5005 #endif
5006
5007
5008
5009 template <bool readAccess, bool writeAccess>
5010 class AlignedDataPtr
5011 {
5012 protected:
5013     const size_t size_;
5014     uchar* const originPtr_;
5015     const size_t alignment_;
5016     uchar* ptr_;
5017     uchar* allocatedPtr_;
5018
5019 public:
5020     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
5021         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
5022     {
5023         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5024         CV_DbgAssert(!readAccess || ptr);
5025         if (((size_t)ptr_ & (alignment - 1)) != 0)
5026         {
5027             allocatedPtr_ = new uchar[size_ + alignment - 1];
5028             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5029             if (readAccess)
5030             {
5031                 memcpy(ptr_, originPtr_, size_);
5032             }
5033         }
5034     }
5035
5036     uchar* getAlignedPtr() const
5037     {
5038         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5039         return ptr_;
5040     }
5041
5042     ~AlignedDataPtr()
5043     {
5044         if (allocatedPtr_)
5045         {
5046             if (writeAccess)
5047             {
5048                 memcpy(originPtr_, ptr_, size_);
5049             }
5050             delete[] allocatedPtr_;
5051             allocatedPtr_ = NULL;
5052         }
5053         ptr_ = NULL;
5054     }
5055 private:
5056     AlignedDataPtr(const AlignedDataPtr&); // disabled
5057     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
5058 };
5059
5060 template <bool readAccess, bool writeAccess>
5061 class AlignedDataPtr2D
5062 {
5063 protected:
5064     const size_t size_;
5065     uchar* const originPtr_;
5066     const size_t alignment_;
5067     uchar* ptr_;
5068     uchar* allocatedPtr_;
5069     size_t rows_;
5070     size_t cols_;
5071     size_t step_;
5072
5073 public:
5074     AlignedDataPtr2D(uchar* ptr, size_t rows, size_t cols, size_t step, size_t alignment, size_t extrabytes=0)
5075         : size_(rows*step), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL), rows_(rows), cols_(cols), step_(step)
5076     {
5077         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
5078         CV_DbgAssert(!readAccess || ptr != NULL);
5079         if (ptr == 0 || ((size_t)ptr_ & (alignment - 1)) != 0)
5080         {
5081             allocatedPtr_ = new uchar[size_ + extrabytes + alignment - 1];
5082             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
5083             if (readAccess)
5084             {
5085                 for (size_t i = 0; i < rows_; i++)
5086                     memcpy(ptr_ + i*step_, originPtr_ + i*step_, cols_);
5087             }
5088         }
5089     }
5090
5091     uchar* getAlignedPtr() const
5092     {
5093         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
5094         return ptr_;
5095     }
5096
5097     ~AlignedDataPtr2D()
5098     {
5099         if (allocatedPtr_)
5100         {
5101             if (writeAccess)
5102             {
5103                 for (size_t i = 0; i < rows_; i++)
5104                     memcpy(originPtr_ + i*step_, ptr_ + i*step_, cols_);
5105             }
5106             delete[] allocatedPtr_;
5107             allocatedPtr_ = NULL;
5108         }
5109         ptr_ = NULL;
5110     }
5111 private:
5112     AlignedDataPtr2D(const AlignedDataPtr2D&); // disabled
5113     AlignedDataPtr2D& operator=(const AlignedDataPtr2D&); // disabled
5114 };
5115
5116 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
5117 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
5118 #endif
5119
5120
5121 void Context::Impl::__init_buffer_pools()
5122 {
5123     bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
5124     OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
5125     bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
5126     OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
5127
5128     size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
5129     size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
5130     bufferPool.setMaxReservedSize(poolSize);
5131     size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
5132     bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
5133
5134 #ifdef HAVE_OPENCL_SVM
5135     bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
5136     OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
5137     size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
5138     bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
5139 #endif
5140
5141     CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
5142 }
5143
5144 class OpenCLAllocator CV_FINAL : public MatAllocator
5145 {
5146 public:
5147     enum AllocatorFlags
5148     {
5149         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
5150         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1,
5151 #ifdef HAVE_OPENCL_SVM
5152         ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2,
5153 #endif
5154         ALLOCATOR_FLAGS_EXTERNAL_BUFFER = 1 << 3  // convertFromBuffer()
5155     };
5156
5157     OpenCLAllocator()
5158     {
5159         matStdAllocator = Mat::getDefaultAllocator();
5160     }
5161     ~OpenCLAllocator()
5162     {
5163         flushCleanupQueue();
5164     }
5165
5166     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
5167             AccessFlag flags, UMatUsageFlags usageFlags) const
5168     {
5169         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
5170         return u;
5171     }
5172
5173     static bool isOpenCLMapForced()  // force clEnqueueMapBuffer / clEnqueueUnmapMemObject OpenCL API
5174     {
5175         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_MAPPING", false);
5176         return value;
5177     }
5178     static bool isOpenCLCopyingForced()  // force clEnqueueReadBuffer[Rect] / clEnqueueWriteBuffer[Rect] OpenCL API
5179     {
5180         static bool value = cv::utils::getConfigurationParameterBool("OPENCV_OPENCL_BUFFER_FORCE_COPYING", false);
5181         return value;
5182     }
5183
5184     void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
5185     {
5186         const Device& dev = ctx.device(0);
5187         createFlags = 0;
5188         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
5189             createFlags |= CL_MEM_ALLOC_HOST_PTR;
5190
5191         if (!isOpenCLCopyingForced() &&
5192             (isOpenCLMapForced() ||
5193                 (dev.hostUnifiedMemory()
5194 #ifndef __APPLE__
5195                 || dev.isIntel()
5196 #endif
5197                 )
5198             )
5199         )
5200             flags0 = static_cast<UMatData::MemoryFlag>(0);
5201         else
5202             flags0 = UMatData::COPY_ON_MAP;
5203     }
5204
5205     UMatData* allocate(int dims, const int* sizes, int type,
5206                        void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5207     {
5208         if(!useOpenCL())
5209             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5210
5211         flushCleanupQueue();
5212
5213         CV_Assert(data == 0);
5214         size_t total = CV_ELEM_SIZE(type);
5215         for( int i = dims-1; i >= 0; i-- )
5216         {
5217             if( step )
5218                 step[i] = total;
5219             total *= sizes[i];
5220         }
5221
5222         Context& ctx = Context::getDefault();
5223         if (!ctx.getImpl())
5224             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5225         Context::Impl& ctxImpl = *ctx.getImpl();
5226
5227         int createFlags = 0;
5228         UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5229         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
5230
5231         void* handle = NULL;
5232         int allocatorFlags = 0;
5233
5234 #ifdef HAVE_OPENCL_SVM
5235         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5236         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
5237         {
5238             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
5239             handle = ctxImpl.getBufferPoolSVM().allocate(total);
5240
5241             // this property is constant, so single buffer pool can be used here
5242             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5243             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5244         }
5245         else
5246 #endif
5247         if (createFlags == 0)
5248         {
5249             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
5250             handle = ctxImpl.getBufferPool().allocate(total);
5251         }
5252         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
5253         {
5254             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
5255             handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
5256         }
5257         else
5258         {
5259             CV_Assert(handle != NULL); // Unsupported, throw
5260         }
5261
5262         if (!handle)
5263             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
5264
5265         UMatData* u = new UMatData(this);
5266         u->data = 0;
5267         u->size = total;
5268         u->handle = handle;
5269         u->flags = flags0;
5270         u->allocatorFlags_ = allocatorFlags;
5271         u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
5272         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
5273         u->markHostCopyObsolete(true);
5274         opencl_allocator_stats.onAllocate(u->size);
5275         return u;
5276     }
5277
5278     bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
5279     {
5280         if(!u)
5281             return false;
5282
5283         flushCleanupQueue();
5284
5285         UMatDataAutoLock lock(u);
5286
5287         if(u->handle == 0)
5288         {
5289             CV_Assert(u->origdata != 0);
5290             Context& ctx = Context::getDefault();
5291             int createFlags = 0;
5292             UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
5293             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
5294
5295             bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
5296
5297             cl_context ctx_handle = (cl_context)ctx.ptr();
5298             int allocatorFlags = 0;
5299             UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
5300             void* handle = NULL;
5301             cl_int retval = CL_SUCCESS;
5302
5303 #ifdef HAVE_OPENCL_SVM
5304             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
5305             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
5306             if (useSVM && svmCaps.isSupportFineGrainSystem())
5307             {
5308                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
5309                 tempUMatFlags = UMatData::TEMP_UMAT;
5310                 handle = u->origdata;
5311                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
5312             }
5313             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
5314             {
5315                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
5316                 {
5317                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
5318
5319                     cl_svm_mem_flags memFlags = createFlags |
5320                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
5321
5322                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5323                     CV_DbgAssert(svmFns->isValid());
5324
5325                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
5326                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
5327                     CV_Assert(handle);
5328
5329                     cl_command_queue q = NULL;
5330                     if (!isFineGrainBuffer)
5331                     {
5332                         q = (cl_command_queue)Queue::getDefault().ptr();
5333                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
5334                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
5335                                 handle, u->size,
5336                                 0, NULL, NULL);
5337                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5338
5339                     }
5340                     memcpy(handle, u->origdata, u->size);
5341                     if (!isFineGrainBuffer)
5342                     {
5343                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
5344                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
5345                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5346                     }
5347
5348                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
5349                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
5350                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
5351                 }
5352             }
5353             else
5354 #endif
5355             {
5356                 if( copyOnMap )
5357                     accessFlags &= ~ACCESS_FAST;
5358
5359                 tempUMatFlags = UMatData::TEMP_UMAT;
5360                 if (
5361                 #ifdef __APPLE__
5362                     !copyOnMap &&
5363                 #endif
5364                     CV_OPENCL_ENABLE_MEM_USE_HOST_PTR
5365                     // There are OpenCL runtime issues for less aligned data
5366                     && (CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR != 0
5367                         && u->origdata == cv::alignPtr(u->origdata, (int)CV_OPENCL_ALIGNMENT_MEM_USE_HOST_PTR))
5368                     // Avoid sharing of host memory between OpenCL buffers
5369                     && !(u->originalUMatData && u->originalUMatData->handle)
5370                 )
5371                 {
5372                     handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
5373                                             u->size, u->origdata, &retval);
5374                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_USE_HOST_PTR|createFlags, sz=%lld, origdata=%p) => %p",
5375                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5376                 }
5377                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
5378                 {
5379                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
5380                                                u->size, u->origdata, &retval);
5381                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer(CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, sz=%lld, origdata=%p) => %p",
5382                             (long long int)u->size, u->origdata, (void*)handle).c_str());
5383                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
5384                 }
5385             }
5386             CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clCreateBuffer() => %p", (void*)handle).c_str());
5387             if(!handle || retval != CL_SUCCESS)
5388                 return false;
5389             u->handle = handle;
5390             u->prevAllocator = u->currAllocator;
5391             u->currAllocator = this;
5392             u->flags |= tempUMatFlags | flags0;
5393             u->allocatorFlags_ = allocatorFlags;
5394         }
5395         if (!!(accessFlags & ACCESS_WRITE))
5396             u->markHostCopyObsolete(true);
5397         opencl_allocator_stats.onAllocate(u->size);
5398         return true;
5399     }
5400
5401     /*void sync(UMatData* u) const
5402     {
5403         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5404         UMatDataAutoLock lock(u);
5405
5406         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
5407         {
5408             if( u->tempCopiedUMat() )
5409             {
5410                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5411                                     u->size, u->origdata, 0, 0, 0);
5412             }
5413             else
5414             {
5415                 cl_int retval = 0;
5416                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5417                                                 (CL_MAP_READ | CL_MAP_WRITE),
5418                                                 0, u->size, 0, 0, 0, &retval);
5419                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5420                 clFinish(q);
5421             }
5422             u->markHostCopyObsolete(false);
5423         }
5424         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
5425         {
5426             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5427                                  u->size, u->data, 0, 0, 0);
5428         }
5429     }*/
5430
5431     void deallocate(UMatData* u) const CV_OVERRIDE
5432     {
5433         if(!u)
5434             return;
5435
5436         CV_Assert(u->urefcount == 0);
5437         CV_Assert(u->refcount == 0 && "UMat deallocation error: some derived Mat is still alive");
5438
5439         CV_Assert(u->handle != 0);
5440         CV_Assert(u->mapcount == 0);
5441
5442         if (!!(u->flags & UMatData::ASYNC_CLEANUP))
5443             addToCleanupQueue(u);
5444         else
5445             deallocate_(u);
5446     }
5447
5448     void deallocate_(UMatData* u) const
5449     {
5450         CV_Assert(u);
5451         CV_Assert(u->handle);
5452         if ((u->allocatorFlags_ & ALLOCATOR_FLAGS_EXTERNAL_BUFFER) == 0)
5453         {
5454             opencl_allocator_stats.onFree(u->size);
5455         }
5456
5457 #ifdef _WIN32
5458         if (cv::__termination)  // process is not in consistent state (after ExitProcess call) and terminating
5459             return;             // avoid any OpenCL calls
5460 #endif
5461         if(u->tempUMat())
5462         {
5463             CV_Assert(u->origdata);
5464 //            UMatDataAutoLock lock(u);
5465
5466             if (u->hostCopyObsolete())
5467             {
5468 #ifdef HAVE_OPENCL_SVM
5469                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5470                 {
5471                     Context& ctx = Context::getDefault();
5472                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5473                     CV_DbgAssert(svmFns->isValid());
5474
5475                     if( u->tempCopiedUMat() )
5476                     {
5477                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5478                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
5479                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
5480                         cl_command_queue q = NULL;
5481                         if (!isFineGrainBuffer)
5482                         {
5483                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
5484                             q = (cl_command_queue)Queue::getDefault().ptr();
5485                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5486                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5487                                     u->handle, u->size,
5488                                     0, NULL, NULL);
5489                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5490                         }
5491                         clFinish(q);
5492                         memcpy(u->origdata, u->handle, u->size);
5493                         if (!isFineGrainBuffer)
5494                         {
5495                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5496                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5497                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5498                         }
5499                     }
5500                     else
5501                     {
5502                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
5503                         // nothing
5504                     }
5505                 }
5506                 else
5507 #endif
5508                 {
5509                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5510                     if( u->tempCopiedUMat() )
5511                     {
5512                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5513                         CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
5514                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0));
5515                     }
5516                     else
5517                     {
5518                         cl_int retval = 0;
5519                         if (u->tempUMat())
5520                         {
5521                             CV_Assert(u->mapcount == 0);
5522                             flushCleanupQueue(); // workaround for CL_OUT_OF_RESOURCES problem (#9960)
5523                             void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5524                                 (CL_MAP_READ | CL_MAP_WRITE),
5525                                 0, u->size, 0, 0, 0, &retval);
5526                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, data).c_str());
5527                             CV_Assert(u->origdata == data && "Details: https://github.com/opencv/opencv/issues/6293");
5528                             if (u->originalUMatData)
5529                             {
5530                                 CV_Assert(u->originalUMatData->data == data);
5531                             }
5532                             retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
5533                             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, data, (long long int)u->size).c_str());
5534                             CV_OCL_DBG_CHECK(clFinish(q));
5535                         }
5536                     }
5537                 }
5538                 u->markHostCopyObsolete(false);
5539             }
5540             else
5541             {
5542                 // nothing
5543             }
5544 #ifdef HAVE_OPENCL_SVM
5545             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5546             {
5547                 if( u->tempCopiedUMat() )
5548                 {
5549                     Context& ctx = Context::getDefault();
5550                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5551                     CV_DbgAssert(svmFns->isValid());
5552
5553                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
5554                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
5555                 }
5556             }
5557             else
5558 #endif
5559             {
5560                 cl_int retval = clReleaseMemObject((cl_mem)u->handle);
5561                 CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clReleaseMemObject(ptr=%p)", (void*)u->handle).c_str());
5562             }
5563             u->handle = 0;
5564             u->markDeviceCopyObsolete(true);
5565             u->currAllocator = u->prevAllocator;
5566             u->prevAllocator = NULL;
5567             if(u->data && u->copyOnMap() && u->data != u->origdata)
5568                 fastFree(u->data);
5569             u->data = u->origdata;
5570             u->currAllocator->deallocate(u);
5571             u = NULL;
5572         }
5573         else
5574         {
5575             CV_Assert(u->origdata == NULL);
5576             if(u->data && u->copyOnMap() && u->data != u->origdata)
5577             {
5578                 fastFree(u->data);
5579                 u->data = 0;
5580                 u->markHostCopyObsolete(true);
5581             }
5582             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
5583             {
5584                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5585                 CV_Assert(pCtx);
5586                 ocl::Context& ctx = *pCtx.get();
5587                 CV_Assert(ctx.getImpl());
5588                 ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
5589             }
5590             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
5591             {
5592                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5593                 CV_Assert(pCtx);
5594                 ocl::Context& ctx = *pCtx.get();
5595                 CV_Assert(ctx.getImpl());
5596                 ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
5597             }
5598 #ifdef HAVE_OPENCL_SVM
5599             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
5600             {
5601                 std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
5602                 CV_Assert(pCtx);
5603                 ocl::Context& ctx = *pCtx.get();
5604                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
5605                 {
5606                     //nothing
5607                 }
5608                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
5609                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5610                 {
5611                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5612                     CV_DbgAssert(svmFns->isValid());
5613                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5614
5615                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
5616                     {
5617                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5618                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
5619                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5620                     }
5621                 }
5622                 CV_Assert(ctx.getImpl());
5623                 ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
5624             }
5625 #endif
5626             else
5627             {
5628                 CV_OCL_DBG_CHECK(clReleaseMemObject((cl_mem)u->handle));
5629             }
5630             u->handle = 0;
5631             u->markDeviceCopyObsolete(true);
5632             delete u;
5633             u = NULL;
5634         }
5635         CV_Assert(u == NULL);
5636     }
5637
5638     // synchronized call (external UMatDataAutoLock, see UMat::getMat)
5639     void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
5640     {
5641         CV_Assert(u && u->handle);
5642
5643         if (!!(accessFlags & ACCESS_WRITE))
5644             u->markDeviceCopyObsolete(true);
5645
5646         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5647
5648         {
5649             if( !u->copyOnMap() )
5650             {
5651                 // TODO
5652                 // because there can be other map requests for the same UMat with different access flags,
5653                 // we use the universal (read-write) access mode.
5654 #ifdef HAVE_OPENCL_SVM
5655                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5656                 {
5657                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5658                     {
5659                         Context& ctx = Context::getDefault();
5660                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5661                         CV_DbgAssert(svmFns->isValid());
5662
5663                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
5664                         {
5665                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5666                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
5667                                     u->handle, u->size,
5668                                     0, NULL, NULL);
5669                             CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5670                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
5671                         }
5672                     }
5673                     clFinish(q);
5674                     u->data = (uchar*)u->handle;
5675                     u->markHostCopyObsolete(false);
5676                     u->markDeviceMemMapped(true);
5677                     return;
5678                 }
5679 #endif
5680
5681                 cl_int retval = CL_SUCCESS;
5682                 if (!u->deviceMemMapped())
5683                 {
5684                     CV_Assert(u->refcount == 1);
5685                     CV_Assert(u->mapcount++ == 0);
5686                     u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
5687                                                          (CL_MAP_READ | CL_MAP_WRITE),
5688                                                          0, u->size, 0, 0, 0, &retval);
5689                     CV_OCL_DBG_CHECK_RESULT(retval, cv::format("clEnqueueMapBuffer(handle=%p, sz=%lld) => %p", (void*)u->handle, (long long int)u->size, u->data).c_str());
5690                 }
5691                 if (u->data && retval == CL_SUCCESS)
5692                 {
5693                     u->markHostCopyObsolete(false);
5694                     u->markDeviceMemMapped(true);
5695                     return;
5696                 }
5697
5698                 // TODO Is it really a good idea and was it tested well?
5699                 // if map failed, switch to copy-on-map mode for the particular buffer
5700                 u->flags |= UMatData::COPY_ON_MAP;
5701             }
5702
5703             if(!u->data)
5704             {
5705                 u->data = (uchar*)fastMalloc(u->size);
5706                 u->markHostCopyObsolete(true);
5707             }
5708         }
5709
5710         if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
5711         {
5712             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5713 #ifdef HAVE_OPENCL_SVM
5714             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5715 #endif
5716             cl_int retval = clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5717                     0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5718             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueReadBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5719                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5720             u->markHostCopyObsolete(false);
5721         }
5722     }
5723
5724     void unmap(UMatData* u) const CV_OVERRIDE
5725     {
5726         if(!u)
5727             return;
5728
5729
5730         CV_Assert(u->handle != 0);
5731
5732         UMatDataAutoLock autolock(u);
5733
5734         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5735         cl_int retval = 0;
5736         if( !u->copyOnMap() && u->deviceMemMapped() )
5737         {
5738             CV_Assert(u->data != NULL);
5739 #ifdef HAVE_OPENCL_SVM
5740             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5741             {
5742                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5743                 {
5744                     Context& ctx = Context::getDefault();
5745                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5746                     CV_DbgAssert(svmFns->isValid());
5747
5748                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
5749                     {
5750                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5751                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5752                                 0, NULL, NULL);
5753                         CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5754                         clFinish(q);
5755                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
5756                     }
5757                 }
5758                 if (u->refcount == 0)
5759                     u->data = 0;
5760                 u->markDeviceCopyObsolete(false);
5761                 u->markHostCopyObsolete(true);
5762                 return;
5763             }
5764 #endif
5765             if (u->refcount == 0)
5766             {
5767                 CV_Assert(u->mapcount-- == 1);
5768                 retval = clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0);
5769                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueUnmapMemObject(handle=%p, data=%p, [sz=%lld])", (void*)u->handle, u->data, (long long int)u->size).c_str());
5770                 if (Device::getDefault().isAMD())
5771                 {
5772                     // required for multithreaded applications (see stitching test)
5773                     CV_OCL_DBG_CHECK(clFinish(q));
5774                 }
5775                 u->markDeviceMemMapped(false);
5776                 u->data = 0;
5777                 u->markDeviceCopyObsolete(false);
5778                 u->markHostCopyObsolete(true);
5779             }
5780         }
5781         else if( u->copyOnMap() && u->deviceCopyObsolete() )
5782         {
5783             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
5784 #ifdef HAVE_OPENCL_SVM
5785             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
5786 #endif
5787             retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
5788                                 0, u->size, alignedPtr.getAlignedPtr(), 0, 0, 0);
5789             CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, 0, sz=%lld, data=%p, 0, 0, 0)",
5790                     (void*)u->handle, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
5791             u->markDeviceCopyObsolete(false);
5792             u->markHostCopyObsolete(true);
5793         }
5794     }
5795
5796     bool checkContinuous(int dims, const size_t sz[],
5797                          const size_t srcofs[], const size_t srcstep[],
5798                          const size_t dstofs[], const size_t dststep[],
5799                          size_t& total, size_t new_sz[],
5800                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
5801                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
5802     {
5803         bool iscontinuous = true;
5804         srcrawofs = srcofs ? srcofs[dims-1] : 0;
5805         dstrawofs = dstofs ? dstofs[dims-1] : 0;
5806         total = sz[dims-1];
5807         for( int i = dims-2; i >= 0; i-- )
5808         {
5809             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
5810                 iscontinuous = false;
5811             total *= sz[i];
5812             if( srcofs )
5813                 srcrawofs += srcofs[i]*srcstep[i];
5814             if( dstofs )
5815                 dstrawofs += dstofs[i]*dststep[i];
5816         }
5817
5818         if( !iscontinuous )
5819         {
5820             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
5821             if( dims == 2 )
5822             {
5823                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
5824                 // we assume that new_... arrays are initialized by caller
5825                 // with 0's, so there is no else branch
5826                 if( srcofs )
5827                 {
5828                     new_srcofs[0] = srcofs[1];
5829                     new_srcofs[1] = srcofs[0];
5830                     new_srcofs[2] = 0;
5831                 }
5832
5833                 if( dstofs )
5834                 {
5835                     new_dstofs[0] = dstofs[1];
5836                     new_dstofs[1] = dstofs[0];
5837                     new_dstofs[2] = 0;
5838                 }
5839
5840                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
5841                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
5842             }
5843             else
5844             {
5845                 // we could check for dims == 3 here,
5846                 // but from user perspective this one is more informative
5847                 CV_Assert(dims <= 3);
5848                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
5849                 if( srcofs )
5850                 {
5851                     new_srcofs[0] = srcofs[2];
5852                     new_srcofs[1] = srcofs[1];
5853                     new_srcofs[2] = srcofs[0];
5854                 }
5855
5856                 if( dstofs )
5857                 {
5858                     new_dstofs[0] = dstofs[2];
5859                     new_dstofs[1] = dstofs[1];
5860                     new_dstofs[2] = dstofs[0];
5861                 }
5862
5863                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
5864                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
5865             }
5866         }
5867         return iscontinuous;
5868     }
5869
5870     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
5871                   const size_t srcofs[], const size_t srcstep[],
5872                   const size_t dststep[]) const CV_OVERRIDE
5873     {
5874         if(!u)
5875             return;
5876         UMatDataAutoLock autolock(u);
5877
5878         if( u->data && !u->hostCopyObsolete() )
5879         {
5880             Mat::getDefaultAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
5881             return;
5882         }
5883         CV_Assert( u->handle != 0 );
5884
5885         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
5886
5887         size_t total = 0, new_sz[] = {0, 0, 0};
5888         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
5889         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
5890
5891         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
5892                                             total, new_sz,
5893                                             srcrawofs, new_srcofs, new_srcstep,
5894                                             dstrawofs, new_dstofs, new_dststep);
5895
5896 #ifdef HAVE_OPENCL_SVM
5897         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
5898         {
5899             CV_DbgAssert(u->data == NULL || u->data == u->handle);
5900             Context& ctx = Context::getDefault();
5901             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
5902             CV_DbgAssert(svmFns->isValid());
5903
5904             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
5905             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5906             {
5907                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
5908                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
5909                         u->handle, u->size,
5910                         0, NULL, NULL);
5911                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
5912             }
5913             clFinish(q);
5914             if( iscontinuous )
5915             {
5916                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
5917             }
5918             else
5919             {
5920                 // This code is from MatAllocator::download()
5921                 int isz[CV_MAX_DIM];
5922                 uchar* srcptr = (uchar*)u->handle;
5923                 for( int i = 0; i < dims; i++ )
5924                 {
5925                     CV_Assert( sz[i] <= (size_t)INT_MAX );
5926                     if( sz[i] == 0 )
5927                     return;
5928                     if( srcofs )
5929                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
5930                     isz[i] = (int)sz[i];
5931                 }
5932
5933                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
5934                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
5935
5936                 const Mat* arrays[] = { &src, &dst };
5937                 uchar* ptrs[2];
5938                 NAryMatIterator it(arrays, ptrs, 2);
5939                 size_t j, planesz = it.size;
5940
5941                 for( j = 0; j < it.nplanes; j++, ++it )
5942                     memcpy(ptrs[1], ptrs[0], planesz);
5943             }
5944             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
5945             {
5946                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
5947                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
5948                         0, NULL, NULL);
5949                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
5950                 clFinish(q);
5951             }
5952         }
5953         else
5954 #endif
5955         {
5956             if( iscontinuous )
5957             {
5958                 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
5959                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5960                     srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0));
5961             }
5962             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
5963             {
5964                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
5965                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
5966                 size_t membuf_ofs = srcrawofs - new_srcrawofs;
5967                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_srcstep[0], new_srcstep[0],
5968                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
5969                 uchar* ptr = alignedPtr.getAlignedPtr();
5970
5971                 CV_Assert(new_srcstep[0] >= new_sz[0]);
5972                 total = alignSize(new_srcstep[0]*new_sz[1] + membuf_ofs, padding);
5973                 total = std::min(total, u->size - new_srcrawofs);
5974                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
5975                                                  new_srcrawofs, total, ptr, 0, 0, 0));
5976                 for( size_t i = 0; i < new_sz[1]; i++ )
5977                     memcpy( (uchar*)dstptr + i*new_dststep[0], ptr + i*new_srcstep[0] + membuf_ofs, new_sz[0]);
5978             }
5979             else
5980             {
5981                 AlignedDataPtr2D<false, true> alignedPtr((uchar*)dstptr, new_sz[1], new_sz[0], new_dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
5982                 uchar* ptr = alignedPtr.getAlignedPtr();
5983
5984                 CV_OCL_CHECK(clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
5985                     new_srcofs, new_dstofs, new_sz,
5986                     new_srcstep[0], 0,
5987                     new_dststep[0], 0,
5988                     ptr, 0, 0, 0));
5989             }
5990         }
5991     }
5992
5993     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
5994                 const size_t dstofs[], const size_t dststep[],
5995                 const size_t srcstep[]) const CV_OVERRIDE
5996     {
5997         if(!u)
5998             return;
5999
6000         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6001         CV_Assert(u->refcount == 0 || u->tempUMat());
6002
6003         size_t total = 0, new_sz[] = {0, 0, 0};
6004         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6005         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6006
6007         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
6008                                             total, new_sz,
6009                                             srcrawofs, new_srcofs, new_srcstep,
6010                                             dstrawofs, new_dstofs, new_dststep);
6011
6012         UMatDataAutoLock autolock(u);
6013
6014         // if there is cached CPU copy of the GPU matrix,
6015         // we could use it as a destination.
6016         // we can do it in 2 cases:
6017         //    1. we overwrite the whole content
6018         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
6019         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
6020         {
6021             Mat::getDefaultAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
6022             u->markHostCopyObsolete(false);
6023             u->markDeviceCopyObsolete(true);
6024             return;
6025         }
6026
6027         CV_Assert( u->handle != 0 );
6028         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6029
6030 #ifdef HAVE_OPENCL_SVM
6031         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6032         {
6033             CV_DbgAssert(u->data == NULL || u->data == u->handle);
6034             Context& ctx = Context::getDefault();
6035             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6036             CV_DbgAssert(svmFns->isValid());
6037
6038             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
6039             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6040             {
6041                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
6042                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
6043                         u->handle, u->size,
6044                         0, NULL, NULL);
6045                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMap()");
6046             }
6047             clFinish(q);
6048             if( iscontinuous )
6049             {
6050                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
6051             }
6052             else
6053             {
6054                 // This code is from MatAllocator::upload()
6055                 int isz[CV_MAX_DIM];
6056                 uchar* dstptr = (uchar*)u->handle;
6057                 for( int i = 0; i < dims; i++ )
6058                 {
6059                     CV_Assert( sz[i] <= (size_t)INT_MAX );
6060                     if( sz[i] == 0 )
6061                     return;
6062                     if( dstofs )
6063                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6064                     isz[i] = (int)sz[i];
6065                 }
6066
6067                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
6068                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
6069
6070                 const Mat* arrays[] = { &src, &dst };
6071                 uchar* ptrs[2];
6072                 NAryMatIterator it(arrays, ptrs, 2);
6073                 size_t j, planesz = it.size;
6074
6075                 for( j = 0; j < it.nplanes; j++, ++it )
6076                     memcpy(ptrs[1], ptrs[0], planesz);
6077             }
6078             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
6079             {
6080                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
6081                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
6082                         0, NULL, NULL);
6083                 CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
6084                 clFinish(q);
6085             }
6086         }
6087         else
6088 #endif
6089         {
6090             if( iscontinuous )
6091             {
6092                 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, total, CV_OPENCL_DATA_PTR_ALIGNMENT);
6093                 cl_int retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6094                     dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0);
6095                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueWriteBuffer(q, handle=%p, CL_TRUE, offset=%lld, sz=%lld, data=%p, 0, 0, 0)",
6096                         (void*)u->handle, (long long int)dstrawofs, (long long int)u->size, alignedPtr.getAlignedPtr()).c_str());
6097             }
6098             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6099             {
6100                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6101                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6102                 size_t membuf_ofs = dstrawofs - new_dstrawofs;
6103                 AlignedDataPtr2D<false, false> alignedPtr(0, new_sz[1], new_dststep[0], new_dststep[0],
6104                                                           CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6105                 uchar* ptr = alignedPtr.getAlignedPtr();
6106
6107                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6108                 total = alignSize(new_dststep[0]*new_sz[1] + membuf_ofs, padding);
6109                 total = std::min(total, u->size - new_dstrawofs);
6110                 /*printf("new_sz0=%d, new_sz1=%d, membuf_ofs=%d, total=%d (%08x), new_dstrawofs=%d (%08x)\n",
6111                        (int)new_sz[0], (int)new_sz[1], (int)membuf_ofs,
6112                        (int)total, (int)total, (int)new_dstrawofs, (int)new_dstrawofs);*/
6113                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
6114                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6115                 for( size_t i = 0; i < new_sz[1]; i++ )
6116                     memcpy( ptr + i*new_dststep[0] + membuf_ofs, (uchar*)srcptr + i*new_srcstep[0], new_sz[0]);
6117                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE,
6118                                                  new_dstrawofs, total, ptr, 0, 0, 0));
6119             }
6120             else
6121             {
6122                 AlignedDataPtr2D<true, false> alignedPtr((uchar*)srcptr, new_sz[1], new_sz[0], new_srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
6123                 uchar* ptr = alignedPtr.getAlignedPtr();
6124
6125                 CV_OCL_CHECK(clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
6126                     new_dstofs, new_srcofs, new_sz,
6127                     new_dststep[0], 0,
6128                     new_srcstep[0], 0,
6129                     ptr, 0, 0, 0));
6130             }
6131         }
6132         u->markHostCopyObsolete(true);
6133 #ifdef HAVE_OPENCL_SVM
6134         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6135                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6136         {
6137             // nothing
6138         }
6139         else
6140 #endif
6141         {
6142             u->markHostCopyObsolete(true);
6143         }
6144         u->markDeviceCopyObsolete(false);
6145     }
6146
6147     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
6148               const size_t srcofs[], const size_t srcstep[],
6149               const size_t dstofs[], const size_t dststep[], bool _sync) const CV_OVERRIDE
6150     {
6151         if(!src || !dst)
6152             return;
6153
6154         size_t total = 0, new_sz[] = {0, 0, 0};
6155         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
6156         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
6157
6158         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
6159                                             total, new_sz,
6160                                             srcrawofs, new_srcofs, new_srcstep,
6161                                             dstrawofs, new_dstofs, new_dststep);
6162
6163         UMatDataAutoLock src_autolock(src, dst);
6164
6165         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
6166         {
6167             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6168             return;
6169         }
6170         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
6171         {
6172             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6173             dst->markHostCopyObsolete(false);
6174 #ifdef HAVE_OPENCL_SVM
6175             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6176                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6177             {
6178                 // nothing
6179             }
6180             else
6181 #endif
6182             {
6183                 dst->markDeviceCopyObsolete(true);
6184             }
6185             return;
6186         }
6187
6188         // there should be no user-visible CPU copies of the UMat which we are going to copy to
6189         CV_Assert(dst->refcount == 0);
6190         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6191
6192         cl_int retval = CL_SUCCESS;
6193 #ifdef HAVE_OPENCL_SVM
6194         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
6195                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6196         {
6197             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
6198                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6199             {
6200                 Context& ctx = Context::getDefault();
6201                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
6202                 CV_DbgAssert(svmFns->isValid());
6203
6204                 if( iscontinuous )
6205                 {
6206                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
6207                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
6208                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
6209                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
6210                             total, 0, NULL, NULL);
6211                     CV_OCL_CHECK_RESULT(status, "clEnqueueSVMMemcpy()");
6212                 }
6213                 else
6214                 {
6215                     clFinish(q);
6216                     // This code is from MatAllocator::download()/upload()
6217                     int isz[CV_MAX_DIM];
6218                     uchar* srcptr = (uchar*)src->handle;
6219                     for( int i = 0; i < dims; i++ )
6220                     {
6221                         CV_Assert( sz[i] <= (size_t)INT_MAX );
6222                         if( sz[i] == 0 )
6223                         return;
6224                         if( srcofs )
6225                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
6226                         isz[i] = (int)sz[i];
6227                     }
6228                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
6229
6230                     uchar* dstptr = (uchar*)dst->handle;
6231                     for( int i = 0; i < dims; i++ )
6232                     {
6233                         if( dstofs )
6234                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
6235                     }
6236                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
6237
6238                     const Mat* arrays[] = { &m_src, &m_dst };
6239                     uchar* ptrs[2];
6240                     NAryMatIterator it(arrays, ptrs, 2);
6241                     size_t j, planesz = it.size;
6242
6243                     for( j = 0; j < it.nplanes; j++, ++it )
6244                         memcpy(ptrs[1], ptrs[0], planesz);
6245                 }
6246             }
6247             else
6248             {
6249                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
6250                 {
6251                     map(src, ACCESS_READ);
6252                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
6253                     unmap(src);
6254                 }
6255                 else
6256                 {
6257                     map(dst, ACCESS_WRITE);
6258                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
6259                     unmap(dst);
6260                 }
6261             }
6262         }
6263         else
6264 #endif
6265         {
6266             if( iscontinuous )
6267             {
6268                 retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6269                                                srcrawofs, dstrawofs, total, 0, 0, 0);
6270                 CV_OCL_CHECK_RESULT(retval, cv::format("clEnqueueCopyBuffer(q, src=%p, dst=%p, src_offset=%lld, dst_offset=%lld, sz=%lld, 0, 0, 0)",
6271                         (void*)src->handle, (void*)dst->handle, (long long int)srcrawofs, (long long int)dstrawofs, (long long int)total).c_str());
6272             }
6273             else if (CV_OPENCL_DISABLE_BUFFER_RECT_OPERATIONS)
6274             {
6275                 const size_t padding = CV_OPENCL_DATA_PTR_ALIGNMENT;
6276                 size_t new_srcrawofs = srcrawofs & ~(padding-1);
6277                 size_t srcmembuf_ofs = srcrawofs - new_srcrawofs;
6278                 size_t new_dstrawofs = dstrawofs & ~(padding-1);
6279                 size_t dstmembuf_ofs = dstrawofs - new_dstrawofs;
6280
6281                 AlignedDataPtr2D<false, false> srcBuf(0, new_sz[1], new_srcstep[0], new_srcstep[0],
6282                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6283                 AlignedDataPtr2D<false, false> dstBuf(0, new_sz[1], new_dststep[0], new_dststep[0],
6284                                                       CV_OPENCL_DATA_PTR_ALIGNMENT, padding*2);
6285                 uchar* srcptr = srcBuf.getAlignedPtr();
6286                 uchar* dstptr = dstBuf.getAlignedPtr();
6287
6288                 CV_Assert(new_dststep[0] >= new_sz[0] && new_srcstep[0] >= new_sz[0]);
6289
6290                 size_t src_total = alignSize(new_srcstep[0]*new_sz[1] + srcmembuf_ofs, padding);
6291                 src_total = std::min(src_total, src->size - new_srcrawofs);
6292                 size_t dst_total = alignSize(new_dststep[0]*new_sz[1] + dstmembuf_ofs, padding);
6293                 dst_total = std::min(dst_total, dst->size - new_dstrawofs);
6294
6295                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)src->handle, CL_TRUE,
6296                                                  new_srcrawofs, src_total, srcptr, 0, 0, 0));
6297                 CV_OCL_CHECK(clEnqueueReadBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6298                                                  new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6299
6300                 for( size_t i = 0; i < new_sz[1]; i++ )
6301                     memcpy( dstptr + dstmembuf_ofs + i*new_dststep[0],
6302                             srcptr + srcmembuf_ofs + i*new_srcstep[0], new_sz[0]);
6303                 CV_OCL_CHECK(clEnqueueWriteBuffer(q, (cl_mem)dst->handle, CL_TRUE,
6304                                                   new_dstrawofs, dst_total, dstptr, 0, 0, 0));
6305             }
6306             else
6307             {
6308                 CV_OCL_CHECK(retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
6309                                                    new_srcofs, new_dstofs, new_sz,
6310                                                    new_srcstep[0], 0,
6311                                                    new_dststep[0], 0,
6312                                                    0, 0, 0));
6313             }
6314         }
6315         if (retval == CL_SUCCESS)
6316         {
6317             CV_IMPL_ADD(CV_IMPL_OCL)
6318         }
6319
6320 #ifdef HAVE_OPENCL_SVM
6321         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
6322             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
6323         {
6324             // nothing
6325         }
6326         else
6327 #endif
6328         {
6329             dst->markHostCopyObsolete(true);
6330         }
6331         dst->markDeviceCopyObsolete(false);
6332
6333         if( _sync )
6334         {
6335             CV_OCL_DBG_CHECK(clFinish(q));
6336         }
6337     }
6338
6339     BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
6340     {
6341         ocl::Context ctx = Context::getDefault();
6342         if (ctx.empty())
6343             return NULL;
6344 #ifdef HAVE_OPENCL_SVM
6345         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
6346         {
6347             return &ctx.getImpl()->getBufferPoolSVM();
6348         }
6349 #endif
6350         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
6351         {
6352             return &ctx.getImpl()->getBufferPoolHostPtr();
6353         }
6354         if (id != NULL && strcmp(id, "OCL") != 0)
6355         {
6356             CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
6357         }
6358         return &ctx.getImpl()->getBufferPool();
6359     }
6360
6361     MatAllocator* matStdAllocator;
6362
6363     mutable cv::Mutex cleanupQueueMutex;
6364     mutable std::deque<UMatData*> cleanupQueue;
6365
6366     void flushCleanupQueue() const
6367     {
6368         if (!cleanupQueue.empty())
6369         {
6370             std::deque<UMatData*> q;
6371             {
6372                 cv::AutoLock lock(cleanupQueueMutex);
6373                 q.swap(cleanupQueue);
6374             }
6375             for (std::deque<UMatData*>::const_iterator i = q.begin(); i != q.end(); ++i)
6376             {
6377                 deallocate_(*i);
6378             }
6379         }
6380     }
6381     void addToCleanupQueue(UMatData* u) const
6382     {
6383         //TODO: Validation check: CV_Assert(!u->tempUMat());
6384         {
6385             cv::AutoLock lock(cleanupQueueMutex);
6386             cleanupQueue.push_back(u);
6387         }
6388     }
6389 };
6390
6391 static OpenCLAllocator* getOpenCLAllocator_() // call once guarantee
6392 {
6393     static OpenCLAllocator* g_allocator = new OpenCLAllocator(); // avoid destructor call (using of this object is too wide)
6394     return g_allocator;
6395 }
6396 MatAllocator* getOpenCLAllocator()
6397 {
6398     CV_SINGLETON_LAZY_INIT(MatAllocator, getOpenCLAllocator_())
6399 }
6400
6401 }} // namespace cv::ocl
6402
6403
6404 namespace cv {
6405
6406 // three funcs below are implemented in umatrix.cpp
6407 void setSize( UMat& m, int _dims, const int* _sz, const size_t* _steps,
6408               bool autoSteps = false );
6409 void finalizeHdr(UMat& m);
6410
6411 } // namespace cv
6412
6413
6414 namespace cv { namespace ocl {
6415
6416 /*
6417 // Convert OpenCL buffer memory to UMat
6418 */
6419 void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int type, UMat& dst)
6420 {
6421     int d = 2;
6422     int sizes[] = { rows, cols };
6423
6424     CV_Assert(0 <= d && d <= CV_MAX_DIM);
6425
6426     dst.release();
6427
6428     dst.flags      = (type & Mat::TYPE_MASK) | Mat::MAGIC_VAL;
6429     dst.usageFlags = USAGE_DEFAULT;
6430
6431     setSize(dst, d, sizes, 0, true);
6432     dst.offset = 0;
6433
6434     cl_mem             memobj = (cl_mem)cl_mem_buffer;
6435     cl_mem_object_type mem_type = 0;
6436
6437     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6438
6439     CV_Assert(CL_MEM_OBJECT_BUFFER == mem_type);
6440
6441     size_t total = 0;
6442     CV_OCL_CHECK(clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &total, 0));
6443
6444     CV_OCL_CHECK(clRetainMemObject(memobj));
6445
6446     CV_Assert((int)step >= cols * CV_ELEM_SIZE(type));
6447     CV_Assert(total >= rows * step);
6448
6449     // attach clBuffer to UMatData
6450     dst.u = new UMatData(getOpenCLAllocator());
6451     dst.u->data            = 0;
6452     dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
6453     dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
6454     dst.u->handle          = cl_mem_buffer;
6455     dst.u->origdata        = 0;
6456     dst.u->prevAllocator   = 0;
6457     dst.u->size            = total;
6458
6459     finalizeHdr(dst);
6460     dst.addref();
6461
6462     return;
6463 } // convertFromBuffer()
6464
6465
6466 /*
6467 // Convert OpenCL image2d_t memory to UMat
6468 */
6469 void convertFromImage(void* cl_mem_image, UMat& dst)
6470 {
6471     cl_mem             clImage = (cl_mem)cl_mem_image;
6472     cl_mem_object_type mem_type = 0;
6473
6474     CV_OCL_CHECK(clGetMemObjectInfo(clImage, CL_MEM_TYPE, sizeof(cl_mem_object_type), &mem_type, 0));
6475
6476     CV_Assert(CL_MEM_OBJECT_IMAGE2D == mem_type);
6477
6478     cl_image_format fmt = { 0, 0 };
6479     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_FORMAT, sizeof(cl_image_format), &fmt, 0));
6480
6481     int depth = CV_8U;
6482     switch (fmt.image_channel_data_type)
6483     {
6484     case CL_UNORM_INT8:
6485     case CL_UNSIGNED_INT8:
6486         depth = CV_8U;
6487         break;
6488
6489     case CL_SNORM_INT8:
6490     case CL_SIGNED_INT8:
6491         depth = CV_8S;
6492         break;
6493
6494     case CL_UNORM_INT16:
6495     case CL_UNSIGNED_INT16:
6496         depth = CV_16U;
6497         break;
6498
6499     case CL_SNORM_INT16:
6500     case CL_SIGNED_INT16:
6501         depth = CV_16S;
6502         break;
6503
6504     case CL_SIGNED_INT32:
6505         depth = CV_32S;
6506         break;
6507
6508     case CL_FLOAT:
6509         depth = CV_32F;
6510         break;
6511
6512     default:
6513         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_data_type");
6514     }
6515
6516     int type = CV_8UC1;
6517     switch (fmt.image_channel_order)
6518     {
6519     case CL_R:
6520         type = CV_MAKE_TYPE(depth, 1);
6521         break;
6522
6523     case CL_RGBA:
6524     case CL_BGRA:
6525     case CL_ARGB:
6526         type = CV_MAKE_TYPE(depth, 4);
6527         break;
6528
6529     default:
6530         CV_Error(cv::Error::OpenCLApiCallError, "Not supported image_channel_order");
6531         break;
6532     }
6533
6534     size_t step = 0;
6535     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_ROW_PITCH, sizeof(size_t), &step, 0));
6536
6537     size_t w = 0;
6538     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_WIDTH, sizeof(size_t), &w, 0));
6539
6540     size_t h = 0;
6541     CV_OCL_CHECK(clGetImageInfo(clImage, CL_IMAGE_HEIGHT, sizeof(size_t), &h, 0));
6542
6543     dst.create((int)h, (int)w, type);
6544
6545     cl_mem clBuffer = (cl_mem)dst.handle(ACCESS_READ);
6546
6547     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
6548
6549     size_t offset = 0;
6550     size_t src_origin[3] = { 0, 0, 0 };
6551     size_t region[3] = { w, h, 1 };
6552     CV_OCL_CHECK(clEnqueueCopyImageToBuffer(q, clImage, clBuffer, src_origin, region, offset, 0, NULL, NULL));
6553
6554     CV_OCL_CHECK(clFinish(q));
6555
6556     return;
6557 } // convertFromImage()
6558
6559
6560 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
6561
6562 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
6563 {
6564     cl_uint numDevices = 0;
6565     cl_int status = clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, 0, NULL, &numDevices);
6566     if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
6567     {
6568         CV_OCL_DBG_CHECK_RESULT(status,
6569             cv::format("clGetDeviceIDs(platform, Device::TYPE_ALL, num_entries=0, devices=NULL, numDevices=%p)", &numDevices).c_str());
6570     }
6571
6572     if (numDevices == 0)
6573     {
6574         devices.clear();
6575         return;
6576     }
6577
6578     devices.resize((size_t)numDevices);
6579     CV_OCL_DBG_CHECK(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL, numDevices, &devices[0], &numDevices));
6580 }
6581
6582 struct PlatformInfo::Impl
6583 {
6584     Impl(void* id)
6585     {
6586         refcount = 1;
6587         handle = *(cl_platform_id*)id;
6588         getDevices(devices, handle);
6589
6590         version_ = getStrProp(CL_PLATFORM_VERSION);
6591         parseOpenCLVersion(version_, versionMajor_, versionMinor_);
6592     }
6593
6594     String getStrProp(cl_platform_info prop) const
6595     {
6596         char buf[1024];
6597         size_t sz=0;
6598         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
6599             sz < sizeof(buf) ? String(buf) : String();
6600     }
6601
6602     IMPLEMENT_REFCOUNTABLE();
6603     std::vector<cl_device_id> devices;
6604     cl_platform_id handle;
6605
6606     String version_;
6607     int versionMajor_;
6608     int versionMinor_;
6609 };
6610
6611 PlatformInfo::PlatformInfo()
6612 {
6613     p = 0;
6614 }
6615
6616 PlatformInfo::PlatformInfo(void* platform_id)
6617 {
6618     p = new Impl(platform_id);
6619 }
6620
6621 PlatformInfo::~PlatformInfo()
6622 {
6623     if(p)
6624         p->release();
6625 }
6626
6627 PlatformInfo::PlatformInfo(const PlatformInfo& i)
6628 {
6629     if (i.p)
6630         i.p->addref();
6631     p = i.p;
6632 }
6633
6634 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
6635 {
6636     if (i.p != p)
6637     {
6638         if (i.p)
6639             i.p->addref();
6640         if (p)
6641             p->release();
6642         p = i.p;
6643     }
6644     return *this;
6645 }
6646
6647 int PlatformInfo::deviceNumber() const
6648 {
6649     return p ? (int)p->devices.size() : 0;
6650 }
6651
6652 void PlatformInfo::getDevice(Device& device, int d) const
6653 {
6654     CV_Assert(p && d < (int)p->devices.size() );
6655     if(p)
6656         device.set(p->devices[d]);
6657 }
6658
6659 String PlatformInfo::name() const
6660 {
6661     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
6662 }
6663
6664 String PlatformInfo::vendor() const
6665 {
6666     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
6667 }
6668
6669 String PlatformInfo::version() const
6670 {
6671     return p ? p->version_ : String();
6672 }
6673
6674 int PlatformInfo::versionMajor() const
6675 {
6676     CV_Assert(p);
6677     return p->versionMajor_;
6678 }
6679
6680 int PlatformInfo::versionMinor() const
6681 {
6682     CV_Assert(p);
6683     return p->versionMinor_;
6684 }
6685
6686 static void getPlatforms(std::vector<cl_platform_id>& platforms)
6687 {
6688     cl_uint numPlatforms = 0;
6689     CV_OCL_DBG_CHECK(clGetPlatformIDs(0, NULL, &numPlatforms));
6690
6691     if (numPlatforms == 0)
6692     {
6693         platforms.clear();
6694         return;
6695     }
6696
6697     platforms.resize((size_t)numPlatforms);
6698     CV_OCL_DBG_CHECK(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms));
6699 }
6700
6701 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
6702 {
6703     std::vector<cl_platform_id> platforms;
6704     getPlatforms(platforms);
6705
6706     for (size_t i = 0; i < platforms.size(); i++)
6707         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
6708 }
6709
6710 const char* typeToStr(int type)
6711 {
6712     static const char* tab[]=
6713     {
6714         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6715         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6716         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6717         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6718         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6719         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
6720         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
6721         "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
6722         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6723     };
6724     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6725     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6726     CV_Assert(result);
6727     return result;
6728 }
6729
6730 const char* memopTypeToStr(int type)
6731 {
6732     static const char* tab[] =
6733     {
6734         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
6735         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
6736         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
6737         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6738         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6739         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6740         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6741         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6742         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6743     };
6744     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6745     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6746     CV_Assert(result);
6747     return result;
6748 }
6749
6750 const char* vecopTypeToStr(int type)
6751 {
6752     static const char* tab[] =
6753     {
6754         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6755         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
6756         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6757         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
6758         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6759         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
6760         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
6761         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
6762         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
6763     };
6764     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
6765     const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
6766     CV_Assert(result);
6767     return result;
6768 }
6769
6770 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
6771 {
6772     if( sdepth == ddepth )
6773         return "noconvert";
6774     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
6775     if( ddepth >= CV_32F ||
6776         (ddepth == CV_32S && sdepth < CV_32S) ||
6777         (ddepth == CV_16S && sdepth <= CV_8S) ||
6778         (ddepth == CV_16U && sdepth == CV_8U))
6779     {
6780         sprintf(buf, "convert_%s", typestr);
6781     }
6782     else if( sdepth >= CV_32F )
6783         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
6784     else
6785         sprintf(buf, "convert_%s_sat", typestr);
6786
6787     return buf;
6788 }
6789
6790 const char* getOpenCLErrorString(int errorCode)
6791 {
6792 #define CV_OCL_CODE(id) case id: return #id
6793 #define CV_OCL_CODE_(id, name) case id: return #name
6794     switch (errorCode)
6795     {
6796     CV_OCL_CODE(CL_SUCCESS);
6797     CV_OCL_CODE(CL_DEVICE_NOT_FOUND);
6798     CV_OCL_CODE(CL_DEVICE_NOT_AVAILABLE);
6799     CV_OCL_CODE(CL_COMPILER_NOT_AVAILABLE);
6800     CV_OCL_CODE(CL_MEM_OBJECT_ALLOCATION_FAILURE);
6801     CV_OCL_CODE(CL_OUT_OF_RESOURCES);
6802     CV_OCL_CODE(CL_OUT_OF_HOST_MEMORY);
6803     CV_OCL_CODE(CL_PROFILING_INFO_NOT_AVAILABLE);
6804     CV_OCL_CODE(CL_MEM_COPY_OVERLAP);
6805     CV_OCL_CODE(CL_IMAGE_FORMAT_MISMATCH);
6806     CV_OCL_CODE(CL_IMAGE_FORMAT_NOT_SUPPORTED);
6807     CV_OCL_CODE(CL_BUILD_PROGRAM_FAILURE);
6808     CV_OCL_CODE(CL_MAP_FAILURE);
6809     CV_OCL_CODE(CL_MISALIGNED_SUB_BUFFER_OFFSET);
6810     CV_OCL_CODE(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
6811     CV_OCL_CODE(CL_COMPILE_PROGRAM_FAILURE);
6812     CV_OCL_CODE(CL_LINKER_NOT_AVAILABLE);
6813     CV_OCL_CODE(CL_LINK_PROGRAM_FAILURE);
6814     CV_OCL_CODE(CL_DEVICE_PARTITION_FAILED);
6815     CV_OCL_CODE(CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
6816     CV_OCL_CODE(CL_INVALID_VALUE);
6817     CV_OCL_CODE(CL_INVALID_DEVICE_TYPE);
6818     CV_OCL_CODE(CL_INVALID_PLATFORM);
6819     CV_OCL_CODE(CL_INVALID_DEVICE);
6820     CV_OCL_CODE(CL_INVALID_CONTEXT);
6821     CV_OCL_CODE(CL_INVALID_QUEUE_PROPERTIES);
6822     CV_OCL_CODE(CL_INVALID_COMMAND_QUEUE);
6823     CV_OCL_CODE(CL_INVALID_HOST_PTR);
6824     CV_OCL_CODE(CL_INVALID_MEM_OBJECT);
6825     CV_OCL_CODE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
6826     CV_OCL_CODE(CL_INVALID_IMAGE_SIZE);
6827     CV_OCL_CODE(CL_INVALID_SAMPLER);
6828     CV_OCL_CODE(CL_INVALID_BINARY);
6829     CV_OCL_CODE(CL_INVALID_BUILD_OPTIONS);
6830     CV_OCL_CODE(CL_INVALID_PROGRAM);
6831     CV_OCL_CODE(CL_INVALID_PROGRAM_EXECUTABLE);
6832     CV_OCL_CODE(CL_INVALID_KERNEL_NAME);
6833     CV_OCL_CODE(CL_INVALID_KERNEL_DEFINITION);
6834     CV_OCL_CODE(CL_INVALID_KERNEL);
6835     CV_OCL_CODE(CL_INVALID_ARG_INDEX);
6836     CV_OCL_CODE(CL_INVALID_ARG_VALUE);
6837     CV_OCL_CODE(CL_INVALID_ARG_SIZE);
6838     CV_OCL_CODE(CL_INVALID_KERNEL_ARGS);
6839     CV_OCL_CODE(CL_INVALID_WORK_DIMENSION);
6840     CV_OCL_CODE(CL_INVALID_WORK_GROUP_SIZE);
6841     CV_OCL_CODE(CL_INVALID_WORK_ITEM_SIZE);
6842     CV_OCL_CODE(CL_INVALID_GLOBAL_OFFSET);
6843     CV_OCL_CODE(CL_INVALID_EVENT_WAIT_LIST);
6844     CV_OCL_CODE(CL_INVALID_EVENT);
6845     CV_OCL_CODE(CL_INVALID_OPERATION);
6846     CV_OCL_CODE(CL_INVALID_GL_OBJECT);
6847     CV_OCL_CODE(CL_INVALID_BUFFER_SIZE);
6848     CV_OCL_CODE(CL_INVALID_MIP_LEVEL);
6849     CV_OCL_CODE(CL_INVALID_GLOBAL_WORK_SIZE);
6850     // OpenCL 1.1
6851     CV_OCL_CODE(CL_INVALID_PROPERTY);
6852     // OpenCL 1.2
6853     CV_OCL_CODE(CL_INVALID_IMAGE_DESCRIPTOR);
6854     CV_OCL_CODE(CL_INVALID_COMPILER_OPTIONS);
6855     CV_OCL_CODE(CL_INVALID_LINKER_OPTIONS);
6856     CV_OCL_CODE(CL_INVALID_DEVICE_PARTITION_COUNT);
6857     // OpenCL 2.0
6858     CV_OCL_CODE_(-69, CL_INVALID_PIPE_SIZE);
6859     CV_OCL_CODE_(-70, CL_INVALID_DEVICE_QUEUE);
6860     // Extensions
6861     CV_OCL_CODE_(-1000, CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR);
6862     CV_OCL_CODE_(-1001, CL_PLATFORM_NOT_FOUND_KHR);
6863     CV_OCL_CODE_(-1002, CL_INVALID_D3D10_DEVICE_KHR);
6864     CV_OCL_CODE_(-1003, CL_INVALID_D3D10_RESOURCE_KHR);
6865     CV_OCL_CODE_(-1004, CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR);
6866     CV_OCL_CODE_(-1005, CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR);
6867     default: return "Unknown OpenCL error";
6868     }
6869 #undef CV_OCL_CODE
6870 #undef CV_OCL_CODE_
6871 }
6872
6873 template <typename T>
6874 static std::string kerToStr(const Mat & k)
6875 {
6876     int width = k.cols - 1, depth = k.depth();
6877     const T * const data = k.ptr<T>();
6878
6879     std::ostringstream stream;
6880     stream.precision(10);
6881
6882     if (depth <= CV_8S)
6883     {
6884         for (int i = 0; i < width; ++i)
6885             stream << "DIG(" << (int)data[i] << ")";
6886         stream << "DIG(" << (int)data[width] << ")";
6887     }
6888     else if (depth == CV_32F)
6889     {
6890         stream.setf(std::ios_base::showpoint);
6891         for (int i = 0; i < width; ++i)
6892             stream << "DIG(" << data[i] << "f)";
6893         stream << "DIG(" << data[width] << "f)";
6894     }
6895     else
6896     {
6897         for (int i = 0; i < width; ++i)
6898             stream << "DIG(" << data[i] << ")";
6899         stream << "DIG(" << data[width] << ")";
6900     }
6901
6902     return stream.str();
6903 }
6904
6905 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
6906 {
6907     Mat kernel = _kernel.getMat().reshape(1, 1);
6908
6909     int depth = kernel.depth();
6910     if (ddepth < 0)
6911         ddepth = depth;
6912
6913     if (ddepth != depth)
6914         kernel.convertTo(kernel, ddepth);
6915
6916     typedef std::string (* func_t)(const Mat &);
6917     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
6918                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
6919     const func_t func = funcs[ddepth];
6920     CV_Assert(func != 0);
6921
6922     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
6923 }
6924
6925 #define PROCESS_SRC(src) \
6926     do \
6927     { \
6928         if (!src.empty()) \
6929         { \
6930             CV_Assert(src.isMat() || src.isUMat()); \
6931             Size csize = src.size(); \
6932             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
6933                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
6934             if (cwidth < ckercn || ckercn <= 0) \
6935                 return 1; \
6936             cols.push_back(cwidth); \
6937             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
6938                 return 1; \
6939             offsets.push_back(src.offset()); \
6940             steps.push_back(src.step()); \
6941             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
6942             kercns.push_back(ckercn); \
6943         } \
6944     } \
6945     while ((void)0, 0)
6946
6947 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
6948                               InputArray src4, InputArray src5, InputArray src6,
6949                               InputArray src7, InputArray src8, InputArray src9,
6950                               OclVectorStrategy strat)
6951 {
6952     const ocl::Device & d = ocl::Device::getDefault();
6953
6954     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
6955         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
6956         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
6957         d.preferredVectorWidthDouble(), -1 };
6958
6959     // if the device says don't use vectors
6960     if (vectorWidths[0] == 1)
6961     {
6962         // it's heuristic
6963         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
6964         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
6965         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
6966     }
6967
6968     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
6969 }
6970
6971 int checkOptimalVectorWidth(const int *vectorWidths,
6972                             InputArray src1, InputArray src2, InputArray src3,
6973                             InputArray src4, InputArray src5, InputArray src6,
6974                             InputArray src7, InputArray src8, InputArray src9,
6975                             OclVectorStrategy strat)
6976 {
6977     CV_Assert(vectorWidths);
6978
6979     int ref_type = src1.type();
6980
6981     std::vector<size_t> offsets, steps, cols;
6982     std::vector<int> dividers, kercns;
6983     PROCESS_SRC(src1);
6984     PROCESS_SRC(src2);
6985     PROCESS_SRC(src3);
6986     PROCESS_SRC(src4);
6987     PROCESS_SRC(src5);
6988     PROCESS_SRC(src6);
6989     PROCESS_SRC(src7);
6990     PROCESS_SRC(src8);
6991     PROCESS_SRC(src9);
6992
6993     size_t size = offsets.size();
6994
6995     for (size_t i = 0; i < size; ++i)
6996         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
6997             dividers[i] >>= 1, kercns[i] >>= 1;
6998
6999     // default strategy
7000     int kercn = *std::min_element(kercns.begin(), kercns.end());
7001
7002     return kercn;
7003 }
7004
7005 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
7006                                  InputArray src4, InputArray src5, InputArray src6,
7007                                  InputArray src7, InputArray src8, InputArray src9)
7008 {
7009     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
7010 }
7011
7012 #undef PROCESS_SRC
7013
7014
7015 // TODO Make this as a method of OpenCL "BuildOptions" class
7016 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
7017 {
7018     if (!buildOptions.empty())
7019         buildOptions += " ";
7020     int type = _m.type(), depth = CV_MAT_DEPTH(type);
7021     buildOptions += format(
7022             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
7023             name.c_str(), ocl::typeToStr(type),
7024             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
7025             name.c_str(), (int)CV_MAT_CN(type),
7026             name.c_str(), (int)CV_ELEM_SIZE(type),
7027             name.c_str(), (int)CV_ELEM_SIZE1(type),
7028             name.c_str(), (int)depth
7029             );
7030 }
7031
7032
7033 struct Image2D::Impl
7034 {
7035     Impl(const UMat &src, bool norm, bool alias)
7036     {
7037         handle = 0;
7038         refcount = 1;
7039         init(src, norm, alias);
7040     }
7041
7042     ~Impl()
7043     {
7044         if (handle)
7045             clReleaseMemObject(handle);
7046     }
7047
7048     static cl_image_format getImageFormat(int depth, int cn, bool norm)
7049     {
7050         cl_image_format format;
7051         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
7052                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
7053         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
7054                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
7055         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
7056
7057         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
7058         int channelOrder = channelOrders[cn];
7059         format.image_channel_data_type = (cl_channel_type)channelType;
7060         format.image_channel_order = (cl_channel_order)channelOrder;
7061         return format;
7062     }
7063
7064     static bool isFormatSupported(cl_image_format format)
7065     {
7066         if (!haveOpenCL())
7067             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7068
7069         cl_context context = (cl_context)Context::getDefault().ptr();
7070         if (!context)
7071             return false;
7072
7073         // Figure out how many formats are supported by this context.
7074         cl_uint numFormats = 0;
7075         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7076                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
7077                                                 NULL, &numFormats);
7078         CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, NULL)");
7079         if (numFormats > 0)
7080         {
7081             AutoBuffer<cl_image_format> formats(numFormats);
7082             err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
7083                                              CL_MEM_OBJECT_IMAGE2D, numFormats,
7084                                              formats.data(), NULL);
7085             CV_OCL_DBG_CHECK_RESULT(err, "clGetSupportedImageFormats(CL_MEM_OBJECT_IMAGE2D, formats)");
7086             for (cl_uint i = 0; i < numFormats; ++i)
7087             {
7088                 if (!memcmp(&formats[i], &format, sizeof(format)))
7089                 {
7090                     return true;
7091                 }
7092             }
7093         }
7094         return false;
7095     }
7096
7097     void init(const UMat &src, bool norm, bool alias)
7098     {
7099         if (!haveOpenCL())
7100             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
7101
7102         CV_Assert(!src.empty());
7103         CV_Assert(ocl::Device::getDefault().imageSupport());
7104
7105         int err, depth = src.depth(), cn = src.channels();
7106         CV_Assert(cn <= 4);
7107         cl_image_format format = getImageFormat(depth, cn, norm);
7108
7109         if (!isFormatSupported(format))
7110             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
7111
7112         if (alias && !src.handle(ACCESS_RW))
7113             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
7114
7115         cl_context context = (cl_context)Context::getDefault().ptr();
7116         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
7117
7118 #ifdef CL_VERSION_1_2
7119         // this enables backwards portability to
7120         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
7121         const Device & d = ocl::Device::getDefault();
7122         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
7123         CV_Assert(!alias || canCreateAlias(src));
7124         if (1 < major || (1 == major && 2 <= minor))
7125         {
7126             cl_image_desc desc;
7127             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
7128             desc.image_width      = src.cols;
7129             desc.image_height     = src.rows;
7130             desc.image_depth      = 0;
7131             desc.image_array_size = 1;
7132             desc.image_row_pitch  = alias ? src.step[0] : 0;
7133             desc.image_slice_pitch = 0;
7134             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
7135             desc.num_mip_levels   = 0;
7136             desc.num_samples      = 0;
7137             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
7138         }
7139         else
7140 #endif
7141         {
7142             CV_SUPPRESS_DEPRECATED_START
7143             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
7144             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
7145             CV_SUPPRESS_DEPRECATED_END
7146         }
7147         CV_OCL_DBG_CHECK_RESULT(err, "clCreateImage()");
7148
7149         size_t origin[] = { 0, 0, 0 };
7150         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
7151
7152         cl_mem devData;
7153         if (!alias && !src.isContinuous())
7154         {
7155             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
7156             CV_OCL_CHECK_RESULT(err, cv::format("clCreateBuffer(CL_MEM_READ_ONLY, sz=%lld) => %p",
7157                     (long long int)(src.cols * src.rows * src.elemSize()), (void*)devData
7158                 ).c_str());
7159
7160             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
7161             CV_OCL_CHECK(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
7162                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL));
7163             CV_OCL_DBG_CHECK(clFlush(queue));
7164         }
7165         else
7166         {
7167             devData = (cl_mem)src.handle(ACCESS_READ);
7168         }
7169         CV_Assert(devData != NULL);
7170
7171         if (!alias)
7172         {
7173             CV_OCL_CHECK(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0));
7174             if (!src.isContinuous())
7175             {
7176                 CV_OCL_DBG_CHECK(clFlush(queue));
7177                 CV_OCL_DBG_CHECK(clReleaseMemObject(devData));
7178             }
7179         }
7180     }
7181
7182     IMPLEMENT_REFCOUNTABLE();
7183
7184     cl_mem handle;
7185 };
7186
7187 Image2D::Image2D()
7188 {
7189     p = NULL;
7190 }
7191
7192 Image2D::Image2D(const UMat &src, bool norm, bool alias)
7193 {
7194     p = new Impl(src, norm, alias);
7195 }
7196
7197 bool Image2D::canCreateAlias(const UMat &m)
7198 {
7199     bool ret = false;
7200     const Device & d = ocl::Device::getDefault();
7201     if (d.imageFromBufferSupport() && !m.empty())
7202     {
7203         // This is the required pitch alignment in pixels
7204         uint pitchAlign = d.imagePitchAlignment();
7205         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
7206         {
7207             // We don't currently handle the case where the buffer was created
7208             // with CL_MEM_USE_HOST_PTR
7209             if (!m.u->tempUMat())
7210             {
7211                 ret = true;
7212             }
7213         }
7214     }
7215     return ret;
7216 }
7217
7218 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
7219 {
7220     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
7221
7222     return Impl::isFormatSupported(format);
7223 }
7224
7225 Image2D::Image2D(const Image2D & i)
7226 {
7227     p = i.p;
7228     if (p)
7229         p->addref();
7230 }
7231
7232 Image2D & Image2D::operator = (const Image2D & i)
7233 {
7234     if (i.p != p)
7235     {
7236         if (i.p)
7237             i.p->addref();
7238         if (p)
7239             p->release();
7240         p = i.p;
7241     }
7242     return *this;
7243 }
7244
7245 Image2D::~Image2D()
7246 {
7247     if (p)
7248         p->release();
7249 }
7250
7251 void* Image2D::ptr() const
7252 {
7253     return p ? p->handle : 0;
7254 }
7255
7256 bool internal::isOpenCLForced()
7257 {
7258     static bool initialized = false;
7259     static bool value = false;
7260     if (!initialized)
7261     {
7262         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_FORCE", false);
7263         initialized = true;
7264     }
7265     return value;
7266 }
7267
7268 bool internal::isPerformanceCheckBypassed()
7269 {
7270     static bool initialized = false;
7271     static bool value = false;
7272     if (!initialized)
7273     {
7274         value = utils::getConfigurationParameterBool("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
7275         initialized = true;
7276     }
7277     return value;
7278 }
7279
7280 bool internal::isCLBuffer(UMat& u)
7281 {
7282     void* h = u.handle(ACCESS_RW);
7283     if (!h)
7284         return true;
7285     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
7286 #if 1
7287     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
7288         return false;
7289 #else
7290     cl_mem_object_type type = 0;
7291     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
7292     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
7293         return false;
7294 #endif
7295     return true;
7296 }
7297
7298 struct Timer::Impl
7299 {
7300     const Queue queue;
7301
7302     Impl(const Queue& q)
7303         : queue(q)
7304     {
7305     }
7306
7307     ~Impl(){}
7308
7309     void start()
7310     {
7311         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7312         timer.start();
7313     }
7314
7315     void stop()
7316     {
7317         CV_OCL_DBG_CHECK(clFinish((cl_command_queue)queue.ptr()));
7318         timer.stop();
7319     }
7320
7321     uint64 durationNS() const
7322     {
7323         return (uint64)(timer.getTimeSec() * 1e9);
7324     }
7325
7326     TickMeter timer;
7327 };
7328
7329 Timer::Timer(const Queue& q) : p(new Impl(q)) { }
7330 Timer::~Timer() { delete p; }
7331
7332 void Timer::start()
7333 {
7334     CV_Assert(p);
7335     p->start();
7336 }
7337
7338 void Timer::stop()
7339 {
7340     CV_Assert(p);
7341     p->stop();
7342 }
7343
7344 uint64 Timer::durationNS() const
7345 {
7346     CV_Assert(p);
7347     return p->durationNS();
7348 }
7349
7350 }} // namespace
7351
7352 #ifdef HAVE_DIRECTX
7353 namespace cv { namespace directx { namespace internal {
7354 OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
7355 {
7356     ocl::Context::Impl* i = ctx.getImpl();
7357     CV_Assert(i);
7358     return i->getDirectXImpl();
7359 }
7360 }}} // namespace cv::directx::internal
7361 #endif
7362
7363 #endif // HAVE_OPENCL