Doc: update video processing tutorial code for OpenCV v2.4.9 and v3a
[profile/ivi/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50
51 #include "opencv2/core/bufferpool.hpp"
52 #ifndef LOG_BUFFER_POOL
53 # if 0
54 #   define LOG_BUFFER_POOL printf
55 # else
56 #   define LOG_BUFFER_POOL(...)
57 # endif
58 #endif
59
60
61 // TODO Move to some common place
62 static bool getBoolParameter(const char* name, bool defaultValue)
63 {
64     const char* envValue = getenv(name);
65     if (envValue == NULL)
66     {
67         return defaultValue;
68     }
69     cv::String value = envValue;
70     if (value == "1" || value == "True" || value == "true" || value == "TRUE")
71     {
72         return true;
73     }
74     if (value == "0" || value == "False" || value == "false" || value == "FALSE")
75     {
76         return false;
77     }
78     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
79 }
80
81
82 // TODO Move to some common place
83 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
84 {
85 #ifdef HAVE_WINRT
86     const char* envValue = NULL;
87 #else
88     const char* envValue = getenv(name);
89 #endif
90     if (envValue == NULL)
91     {
92         return defaultValue;
93     }
94     cv::String value = envValue;
95     size_t pos = 0;
96     for (; pos < value.size(); pos++)
97     {
98         if (!isdigit(value[pos]))
99             break;
100     }
101     cv::String valueStr = value.substr(0, pos);
102     cv::String suffixStr = value.substr(pos, value.length() - pos);
103     int v = atoi(valueStr.c_str());
104     if (suffixStr.length() == 0)
105         return v;
106     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
107         return v * 1024 * 1024;
108     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
109         return v * 1024;
110     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
111 }
112
113 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
114 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
115
116 #ifdef HAVE_OPENCL
117 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
118 #else
119 // TODO FIXIT: This file can't be build without OPENCL
120
121 /*
122   Part of the file is an extract from the standard OpenCL headers from Khronos site.
123   Below is the original copyright.
124 */
125
126 /*******************************************************************************
127  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
128  *
129  * Permission is hereby granted, free of charge, to any person obtaining a
130  * copy of this software and/or associated documentation files (the
131  * "Materials"), to deal in the Materials without restriction, including
132  * without limitation the rights to use, copy, modify, merge, publish,
133  * distribute, sublicense, and/or sell copies of the Materials, and to
134  * permit persons to whom the Materials are furnished to do so, subject to
135  * the following conditions:
136  *
137  * The above copyright notice and this permission notice shall be included
138  * in all copies or substantial portions of the Materials.
139  *
140  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
141  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
142  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
143  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
144  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
145  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
146  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
147  ******************************************************************************/
148
149 #if 0 //defined __APPLE__
150 #define HAVE_OPENCL 1
151 #else
152 #undef HAVE_OPENCL
153 #endif
154
155 #define OPENCV_CL_NOT_IMPLEMENTED -1000
156
157 #ifdef HAVE_OPENCL
158
159 #if defined __APPLE__
160 #include <OpenCL/opencl.h>
161 #else
162 #include <CL/opencl.h>
163 #endif
164
165 static const bool g_haveOpenCL = true;
166
167 #else
168
169 extern "C" {
170
171 struct _cl_platform_id { int dummy; };
172 struct _cl_device_id { int dummy; };
173 struct _cl_context { int dummy; };
174 struct _cl_command_queue { int dummy; };
175 struct _cl_mem { int dummy; };
176 struct _cl_program { int dummy; };
177 struct _cl_kernel { int dummy; };
178 struct _cl_event { int dummy; };
179 struct _cl_sampler { int dummy; };
180
181 typedef struct _cl_platform_id *    cl_platform_id;
182 typedef struct _cl_device_id *      cl_device_id;
183 typedef struct _cl_context *        cl_context;
184 typedef struct _cl_command_queue *  cl_command_queue;
185 typedef struct _cl_mem *            cl_mem;
186 typedef struct _cl_program *        cl_program;
187 typedef struct _cl_kernel *         cl_kernel;
188 typedef struct _cl_event *          cl_event;
189 typedef struct _cl_sampler *        cl_sampler;
190
191 typedef int cl_int;
192 typedef unsigned cl_uint;
193 #if defined (_WIN32) && defined(_MSC_VER)
194     typedef __int64 cl_long;
195     typedef unsigned __int64 cl_ulong;
196 #else
197     typedef long cl_long;
198     typedef unsigned long cl_ulong;
199 #endif
200
201 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
202 typedef cl_ulong            cl_bitfield;
203 typedef cl_bitfield         cl_device_type;
204 typedef cl_uint             cl_platform_info;
205 typedef cl_uint             cl_device_info;
206 typedef cl_bitfield         cl_device_fp_config;
207 typedef cl_uint             cl_device_mem_cache_type;
208 typedef cl_uint             cl_device_local_mem_type;
209 typedef cl_bitfield         cl_device_exec_capabilities;
210 typedef cl_bitfield         cl_command_queue_properties;
211 typedef intptr_t            cl_device_partition_property;
212 typedef cl_bitfield         cl_device_affinity_domain;
213
214 typedef intptr_t            cl_context_properties;
215 typedef cl_uint             cl_context_info;
216 typedef cl_uint             cl_command_queue_info;
217 typedef cl_uint             cl_channel_order;
218 typedef cl_uint             cl_channel_type;
219 typedef cl_bitfield         cl_mem_flags;
220 typedef cl_uint             cl_mem_object_type;
221 typedef cl_uint             cl_mem_info;
222 typedef cl_bitfield         cl_mem_migration_flags;
223 typedef cl_uint             cl_image_info;
224 typedef cl_uint             cl_buffer_create_type;
225 typedef cl_uint             cl_addressing_mode;
226 typedef cl_uint             cl_filter_mode;
227 typedef cl_uint             cl_sampler_info;
228 typedef cl_bitfield         cl_map_flags;
229 typedef cl_uint             cl_program_info;
230 typedef cl_uint             cl_program_build_info;
231 typedef cl_uint             cl_program_binary_type;
232 typedef cl_int              cl_build_status;
233 typedef cl_uint             cl_kernel_info;
234 typedef cl_uint             cl_kernel_arg_info;
235 typedef cl_uint             cl_kernel_arg_address_qualifier;
236 typedef cl_uint             cl_kernel_arg_access_qualifier;
237 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
238 typedef cl_uint             cl_kernel_work_group_info;
239 typedef cl_uint             cl_event_info;
240 typedef cl_uint             cl_command_type;
241 typedef cl_uint             cl_profiling_info;
242
243
244 typedef struct _cl_image_format {
245     cl_channel_order        image_channel_order;
246     cl_channel_type         image_channel_data_type;
247 } cl_image_format;
248
249 typedef struct _cl_image_desc {
250     cl_mem_object_type      image_type;
251     size_t                  image_width;
252     size_t                  image_height;
253     size_t                  image_depth;
254     size_t                  image_array_size;
255     size_t                  image_row_pitch;
256     size_t                  image_slice_pitch;
257     cl_uint                 num_mip_levels;
258     cl_uint                 num_samples;
259     cl_mem                  buffer;
260 } cl_image_desc;
261
262 typedef struct _cl_buffer_region {
263     size_t                  origin;
264     size_t                  size;
265 } cl_buffer_region;
266
267
268 //////////////////////////////////////////////////////////
269
270 #define CL_SUCCESS                                  0
271 #define CL_DEVICE_NOT_FOUND                         -1
272 #define CL_DEVICE_NOT_AVAILABLE                     -2
273 #define CL_COMPILER_NOT_AVAILABLE                   -3
274 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
275 #define CL_OUT_OF_RESOURCES                         -5
276 #define CL_OUT_OF_HOST_MEMORY                       -6
277 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
278 #define CL_MEM_COPY_OVERLAP                         -8
279 #define CL_IMAGE_FORMAT_MISMATCH                    -9
280 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
281 #define CL_BUILD_PROGRAM_FAILURE                    -11
282 #define CL_MAP_FAILURE                              -12
283 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
284 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
285 #define CL_COMPILE_PROGRAM_FAILURE                  -15
286 #define CL_LINKER_NOT_AVAILABLE                     -16
287 #define CL_LINK_PROGRAM_FAILURE                     -17
288 #define CL_DEVICE_PARTITION_FAILED                  -18
289 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
290
291 #define CL_INVALID_VALUE                            -30
292 #define CL_INVALID_DEVICE_TYPE                      -31
293 #define CL_INVALID_PLATFORM                         -32
294 #define CL_INVALID_DEVICE                           -33
295 #define CL_INVALID_CONTEXT                          -34
296 #define CL_INVALID_QUEUE_PROPERTIES                 -35
297 #define CL_INVALID_COMMAND_QUEUE                    -36
298 #define CL_INVALID_HOST_PTR                         -37
299 #define CL_INVALID_MEM_OBJECT                       -38
300 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
301 #define CL_INVALID_IMAGE_SIZE                       -40
302 #define CL_INVALID_SAMPLER                          -41
303 #define CL_INVALID_BINARY                           -42
304 #define CL_INVALID_BUILD_OPTIONS                    -43
305 #define CL_INVALID_PROGRAM                          -44
306 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
307 #define CL_INVALID_KERNEL_NAME                      -46
308 #define CL_INVALID_KERNEL_DEFINITION                -47
309 #define CL_INVALID_KERNEL                           -48
310 #define CL_INVALID_ARG_INDEX                        -49
311 #define CL_INVALID_ARG_VALUE                        -50
312 #define CL_INVALID_ARG_SIZE                         -51
313 #define CL_INVALID_KERNEL_ARGS                      -52
314 #define CL_INVALID_WORK_DIMENSION                   -53
315 #define CL_INVALID_WORK_GROUP_SIZE                  -54
316 #define CL_INVALID_WORK_ITEM_SIZE                   -55
317 #define CL_INVALID_GLOBAL_OFFSET                    -56
318 #define CL_INVALID_EVENT_WAIT_LIST                  -57
319 #define CL_INVALID_EVENT                            -58
320 #define CL_INVALID_OPERATION                        -59
321 #define CL_INVALID_GL_OBJECT                        -60
322 #define CL_INVALID_BUFFER_SIZE                      -61
323 #define CL_INVALID_MIP_LEVEL                        -62
324 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
325 #define CL_INVALID_PROPERTY                         -64
326 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
327 #define CL_INVALID_COMPILER_OPTIONS                 -66
328 #define CL_INVALID_LINKER_OPTIONS                   -67
329 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
330
331 /*#define CL_VERSION_1_0                              1
332 #define CL_VERSION_1_1                              1
333 #define CL_VERSION_1_2                              1*/
334
335 #define CL_FALSE                                    0
336 #define CL_TRUE                                     1
337 #define CL_BLOCKING                                 CL_TRUE
338 #define CL_NON_BLOCKING                             CL_FALSE
339
340 #define CL_PLATFORM_PROFILE                         0x0900
341 #define CL_PLATFORM_VERSION                         0x0901
342 #define CL_PLATFORM_NAME                            0x0902
343 #define CL_PLATFORM_VENDOR                          0x0903
344 #define CL_PLATFORM_EXTENSIONS                      0x0904
345
346 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
347 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
348 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
349 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
350 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
351 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
352 #define CL_DEVICE_TYPE                              0x1000
353 #define CL_DEVICE_VENDOR_ID                         0x1001
354 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
355 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
356 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
357 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
358 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
359 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
360 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
361 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
362 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
363 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
364 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
365 #define CL_DEVICE_ADDRESS_BITS                      0x100D
366 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
367 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
368 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
369 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
370 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
371 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
372 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
373 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
374 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
375 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
376 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
377 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
378 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
379 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
380 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
381 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
382 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
383 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
384 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
385 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
386 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
387 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
388 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
389 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
390 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
391 #define CL_DEVICE_AVAILABLE                         0x1027
392 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
393 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
394 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
395 #define CL_DEVICE_NAME                              0x102B
396 #define CL_DEVICE_VENDOR                            0x102C
397 #define CL_DRIVER_VERSION                           0x102D
398 #define CL_DEVICE_PROFILE                           0x102E
399 #define CL_DEVICE_VERSION                           0x102F
400 #define CL_DEVICE_EXTENSIONS                        0x1030
401 #define CL_DEVICE_PLATFORM                          0x1031
402 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
403 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
404 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
405 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
406 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
407 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
408 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
409 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
410 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
411 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
412 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
413 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
414 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
415 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
416 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
417 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
418 #define CL_DEVICE_PARENT_DEVICE                     0x1042
419 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
420 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
421 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
422 #define CL_DEVICE_PARTITION_TYPE                    0x1046
423 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
424 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
425 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
426 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
427 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
428
429 #define CL_FP_DENORM                                (1 << 0)
430 #define CL_FP_INF_NAN                               (1 << 1)
431 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
432 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
433 #define CL_FP_ROUND_TO_INF                          (1 << 4)
434 #define CL_FP_FMA                                   (1 << 5)
435 #define CL_FP_SOFT_FLOAT                            (1 << 6)
436 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
437
438 #define CL_NONE                                     0x0
439 #define CL_READ_ONLY_CACHE                          0x1
440 #define CL_READ_WRITE_CACHE                         0x2
441 #define CL_LOCAL                                    0x1
442 #define CL_GLOBAL                                   0x2
443 #define CL_EXEC_KERNEL                              (1 << 0)
444 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
445 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
446 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
447
448 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
449 #define CL_CONTEXT_DEVICES                          0x1081
450 #define CL_CONTEXT_PROPERTIES                       0x1082
451 #define CL_CONTEXT_NUM_DEVICES                      0x1083
452 #define CL_CONTEXT_PLATFORM                         0x1084
453 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
454
455 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
456 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
457 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
458 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
459 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
460 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
461 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
462 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
463 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
464 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
465 #define CL_QUEUE_CONTEXT                            0x1090
466 #define CL_QUEUE_DEVICE                             0x1091
467 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
468 #define CL_QUEUE_PROPERTIES                         0x1093
469 #define CL_MEM_READ_WRITE                           (1 << 0)
470 #define CL_MEM_WRITE_ONLY                           (1 << 1)
471 #define CL_MEM_READ_ONLY                            (1 << 2)
472 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
473 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
474 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
475 // reserved                                         (1 << 6)
476 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
477 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
478 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
479 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
480 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
481
482 #define CL_R                                        0x10B0
483 #define CL_A                                        0x10B1
484 #define CL_RG                                       0x10B2
485 #define CL_RA                                       0x10B3
486 #define CL_RGB                                      0x10B4
487 #define CL_RGBA                                     0x10B5
488 #define CL_BGRA                                     0x10B6
489 #define CL_ARGB                                     0x10B7
490 #define CL_INTENSITY                                0x10B8
491 #define CL_LUMINANCE                                0x10B9
492 #define CL_Rx                                       0x10BA
493 #define CL_RGx                                      0x10BB
494 #define CL_RGBx                                     0x10BC
495 #define CL_DEPTH                                    0x10BD
496 #define CL_DEPTH_STENCIL                            0x10BE
497
498 #define CL_SNORM_INT8                               0x10D0
499 #define CL_SNORM_INT16                              0x10D1
500 #define CL_UNORM_INT8                               0x10D2
501 #define CL_UNORM_INT16                              0x10D3
502 #define CL_UNORM_SHORT_565                          0x10D4
503 #define CL_UNORM_SHORT_555                          0x10D5
504 #define CL_UNORM_INT_101010                         0x10D6
505 #define CL_SIGNED_INT8                              0x10D7
506 #define CL_SIGNED_INT16                             0x10D8
507 #define CL_SIGNED_INT32                             0x10D9
508 #define CL_UNSIGNED_INT8                            0x10DA
509 #define CL_UNSIGNED_INT16                           0x10DB
510 #define CL_UNSIGNED_INT32                           0x10DC
511 #define CL_HALF_FLOAT                               0x10DD
512 #define CL_FLOAT                                    0x10DE
513 #define CL_UNORM_INT24                              0x10DF
514
515 #define CL_MEM_OBJECT_BUFFER                        0x10F0
516 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
517 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
518 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
519 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
520 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
521 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
522
523 #define CL_MEM_TYPE                                 0x1100
524 #define CL_MEM_FLAGS                                0x1101
525 #define CL_MEM_SIZE                                 0x1102
526 #define CL_MEM_HOST_PTR                             0x1103
527 #define CL_MEM_MAP_COUNT                            0x1104
528 #define CL_MEM_REFERENCE_COUNT                      0x1105
529 #define CL_MEM_CONTEXT                              0x1106
530 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
531 #define CL_MEM_OFFSET                               0x1108
532
533 #define CL_IMAGE_FORMAT                             0x1110
534 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
535 #define CL_IMAGE_ROW_PITCH                          0x1112
536 #define CL_IMAGE_SLICE_PITCH                        0x1113
537 #define CL_IMAGE_WIDTH                              0x1114
538 #define CL_IMAGE_HEIGHT                             0x1115
539 #define CL_IMAGE_DEPTH                              0x1116
540 #define CL_IMAGE_ARRAY_SIZE                         0x1117
541 #define CL_IMAGE_BUFFER                             0x1118
542 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
543 #define CL_IMAGE_NUM_SAMPLES                        0x111A
544
545 #define CL_ADDRESS_NONE                             0x1130
546 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
547 #define CL_ADDRESS_CLAMP                            0x1132
548 #define CL_ADDRESS_REPEAT                           0x1133
549 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
550
551 #define CL_FILTER_NEAREST                           0x1140
552 #define CL_FILTER_LINEAR                            0x1141
553
554 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
555 #define CL_SAMPLER_CONTEXT                          0x1151
556 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
557 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
558 #define CL_SAMPLER_FILTER_MODE                      0x1154
559
560 #define CL_MAP_READ                                 (1 << 0)
561 #define CL_MAP_WRITE                                (1 << 1)
562 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
563
564 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
565 #define CL_PROGRAM_CONTEXT                          0x1161
566 #define CL_PROGRAM_NUM_DEVICES                      0x1162
567 #define CL_PROGRAM_DEVICES                          0x1163
568 #define CL_PROGRAM_SOURCE                           0x1164
569 #define CL_PROGRAM_BINARY_SIZES                     0x1165
570 #define CL_PROGRAM_BINARIES                         0x1166
571 #define CL_PROGRAM_NUM_KERNELS                      0x1167
572 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
573 #define CL_PROGRAM_BUILD_STATUS                     0x1181
574 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
575 #define CL_PROGRAM_BUILD_LOG                        0x1183
576 #define CL_PROGRAM_BINARY_TYPE                      0x1184
577 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
578 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
579 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
580 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
581
582 #define CL_BUILD_SUCCESS                            0
583 #define CL_BUILD_NONE                               -1
584 #define CL_BUILD_ERROR                              -2
585 #define CL_BUILD_IN_PROGRESS                        -3
586
587 #define CL_KERNEL_FUNCTION_NAME                     0x1190
588 #define CL_KERNEL_NUM_ARGS                          0x1191
589 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
590 #define CL_KERNEL_CONTEXT                           0x1193
591 #define CL_KERNEL_PROGRAM                           0x1194
592 #define CL_KERNEL_ATTRIBUTES                        0x1195
593 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
594 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
595 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
596 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
597 #define CL_KERNEL_ARG_NAME                          0x119A
598 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
599 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
600 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
601 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
602 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
603 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
604 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
605 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
606 #define CL_KERNEL_ARG_TYPE_NONE                     0
607 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
608 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
609 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
610 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
611 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
612 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
613 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
614 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
615 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
616
617 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
618 #define CL_EVENT_COMMAND_TYPE                       0x11D1
619 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
620 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
621 #define CL_EVENT_CONTEXT                            0x11D4
622
623 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
624 #define CL_COMMAND_TASK                             0x11F1
625 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
626 #define CL_COMMAND_READ_BUFFER                      0x11F3
627 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
628 #define CL_COMMAND_COPY_BUFFER                      0x11F5
629 #define CL_COMMAND_READ_IMAGE                       0x11F6
630 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
631 #define CL_COMMAND_COPY_IMAGE                       0x11F8
632 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
633 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
634 #define CL_COMMAND_MAP_BUFFER                       0x11FB
635 #define CL_COMMAND_MAP_IMAGE                        0x11FC
636 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
637 #define CL_COMMAND_MARKER                           0x11FE
638 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
639 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
640 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
641 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
642 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
643 #define CL_COMMAND_USER                             0x1204
644 #define CL_COMMAND_BARRIER                          0x1205
645 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
646 #define CL_COMMAND_FILL_BUFFER                      0x1207
647 #define CL_COMMAND_FILL_IMAGE                       0x1208
648
649 #define CL_COMPLETE                                 0x0
650 #define CL_RUNNING                                  0x1
651 #define CL_SUBMITTED                                0x2
652 #define CL_QUEUED                                   0x3
653 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
654
655 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
656 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
657 #define CL_PROFILING_COMMAND_START                  0x1282
658 #define CL_PROFILING_COMMAND_END                    0x1283
659
660 #define CL_CALLBACK CV_STDCALL
661
662 static volatile bool g_haveOpenCL = false;
663 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
664
665 #if defined(__APPLE__)
666 #include <dlfcn.h>
667
668 static void* initOpenCLAndLoad(const char* funcname)
669 {
670     static bool initialized = false;
671     static void* handle = 0;
672     if (!handle)
673     {
674         if(!initialized)
675         {
676             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
677             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
678                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
679             handle = dlopen(oclpath, RTLD_LAZY);
680             initialized = true;
681             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
682             if( g_haveOpenCL )
683                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
684             else
685                 fprintf(stderr, "Failed to load OpenCL runtime\n");
686         }
687         if(!handle)
688             return 0;
689     }
690
691     return funcname && handle ? dlsym(handle, funcname) : 0;
692 }
693
694 #elif defined WIN32 || defined _WIN32
695
696 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
697   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
698 #endif
699 #include <windows.h>
700 #if (_WIN32_WINNT >= 0x0602)
701   #include <synchapi.h>
702 #endif
703 #undef small
704 #undef min
705 #undef max
706 #undef abs
707
708 static void* initOpenCLAndLoad(const char* funcname)
709 {
710     static bool initialized = false;
711     static HMODULE handle = 0;
712     if (!handle)
713     {
714 #ifndef HAVE_WINRT
715         if(!initialized)
716         {
717             handle = LoadLibraryA("OpenCL.dll");
718             initialized = true;
719             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
720         }
721 #endif
722         if(!handle)
723             return 0;
724     }
725
726     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
727 }
728
729 #elif defined(__linux)
730
731 #include <dlfcn.h>
732 #include <stdio.h>
733
734 static void* initOpenCLAndLoad(const char* funcname)
735 {
736     static bool initialized = false;
737     static void* handle = 0;
738     if (!handle)
739     {
740         if(!initialized)
741         {
742             handle = dlopen("libOpenCL.so", RTLD_LAZY);
743             if(!handle)
744                 handle = dlopen("libCL.so", RTLD_LAZY);
745             initialized = true;
746             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
747         }
748         if(!handle)
749             return 0;
750     }
751
752     return funcname ? (void*)dlsym(handle, funcname) : 0;
753 }
754
755 #else
756
757 static void* initOpenCLAndLoad(const char*)
758 {
759     return 0;
760 }
761
762 #endif
763
764
765 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
766     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
767     static rettype funcname argsdecl \
768     { \
769         static funcname##_t funcname##_p = 0; \
770         if( !funcname##_p ) \
771         { \
772             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
773             if( !funcname##_p ) \
774                 return OPENCV_CL_NOT_IMPLEMENTED; \
775         } \
776         return funcname##_p args; \
777     }
778
779
780 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
781     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
782     static rettype funcname argsdecl \
783     { \
784         static funcname##_t funcname##_p = 0; \
785         if( !funcname##_p ) \
786         { \
787             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
788             if( !funcname##_p ) \
789             { \
790                 if( errcode_ret ) \
791                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
792                 return 0; \
793             } \
794         } \
795         return funcname##_p args; \
796     }
797
798 OCL_FUNC(cl_int, clGetPlatformIDs,
799     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
800     (num_entries, platforms, num_platforms))
801
802 OCL_FUNC(cl_int, clGetPlatformInfo,
803     (cl_platform_id platform, cl_platform_info param_name,
804     size_t param_value_size, void * param_value,
805     size_t * param_value_size_ret),
806     (platform, param_name, param_value_size, param_value, param_value_size_ret))
807
808 OCL_FUNC(cl_int, clGetDeviceInfo,
809          (cl_device_id device,
810           cl_device_info param_name,
811           size_t param_value_size,
812           void * param_value,
813           size_t * param_value_size_ret),
814          (device, param_name, param_value_size, param_value, param_value_size_ret))
815
816
817 OCL_FUNC(cl_int, clGetDeviceIDs,
818     (cl_platform_id platform,
819     cl_device_type device_type,
820     cl_uint num_entries,
821     cl_device_id * devices,
822     cl_uint * num_devices),
823     (platform, device_type, num_entries, devices, num_devices))
824
825 OCL_FUNC_P(cl_context, clCreateContext,
826     (const cl_context_properties * properties,
827     cl_uint num_devices,
828     const cl_device_id * devices,
829     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
830     void * user_data,
831     cl_int * errcode_ret),
832     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
833
834 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
835
836 /*
837 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
838
839 OCL_FUNC_P(cl_context, clCreateContextFromType,
840     (const cl_context_properties * properties,
841     cl_device_type device_type,
842     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
843     void * user_data,
844     cl_int * errcode_ret),
845     (properties, device_type, pfn_notify, user_data, errcode_ret))
846
847 OCL_FUNC(cl_int, clGetContextInfo,
848     (cl_context context,
849     cl_context_info param_name,
850     size_t param_value_size,
851     void * param_value,
852     size_t * param_value_size_ret),
853     (context, param_name, param_value_size,
854     param_value, param_value_size_ret))
855 */
856 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
857     (cl_context context,
858     cl_device_id device,
859     cl_command_queue_properties properties,
860     cl_int * errcode_ret),
861     (context, device, properties, errcode_ret))
862
863 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
864
865 OCL_FUNC_P(cl_mem, clCreateBuffer,
866     (cl_context context,
867     cl_mem_flags flags,
868     size_t size,
869     void * host_ptr,
870     cl_int * errcode_ret),
871     (context, flags, size, host_ptr, errcode_ret))
872
873 /*
874 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
875
876 OCL_FUNC(cl_int, clGetCommandQueueInfo,
877  (cl_command_queue command_queue,
878  cl_command_queue_info param_name,
879  size_t param_value_size,
880  void * param_value,
881  size_t * param_value_size_ret),
882  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
883
884 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
885     (cl_mem buffer,
886     cl_mem_flags flags,
887     cl_buffer_create_type buffer_create_type,
888     const void * buffer_create_info,
889     cl_int * errcode_ret),
890     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
891 */
892
893 OCL_FUNC_P(cl_mem, clCreateImage,
894     (cl_context context,
895     cl_mem_flags flags,
896     const cl_image_format * image_format,
897     const cl_image_desc * image_desc,
898     void * host_ptr,
899     cl_int * errcode_ret),
900     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
901
902 OCL_FUNC_P(cl_mem, clCreateImage2D,
903     (cl_context context,
904     cl_mem_flags flags,
905     const cl_image_format * image_format,
906     size_t image_width,
907     size_t image_height,
908     size_t image_row_pitch,
909     void * host_ptr,
910     cl_int *errcode_ret),
911     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
912
913 OCL_FUNC(cl_int, clGetSupportedImageFormats,
914  (cl_context context,
915  cl_mem_flags flags,
916  cl_mem_object_type image_type,
917  cl_uint num_entries,
918  cl_image_format * image_formats,
919  cl_uint * num_image_formats),
920  (context, flags, image_type, num_entries, image_formats, num_image_formats))
921
922 /*
923 OCL_FUNC(cl_int, clGetMemObjectInfo,
924  (cl_mem memobj,
925  cl_mem_info param_name,
926  size_t param_value_size,
927  void * param_value,
928  size_t * param_value_size_ret),
929  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
930
931 OCL_FUNC(cl_int, clGetImageInfo,
932  (cl_mem image,
933  cl_image_info param_name,
934  size_t param_value_size,
935  void * param_value,
936  size_t * param_value_size_ret),
937  (image, param_name, param_value_size, param_value, param_value_size_ret))
938
939 OCL_FUNC(cl_int, clCreateKernelsInProgram,
940  (cl_program program,
941  cl_uint num_kernels,
942  cl_kernel * kernels,
943  cl_uint * num_kernels_ret),
944  (program, num_kernels, kernels, num_kernels_ret))
945
946 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
947
948 OCL_FUNC(cl_int, clGetKernelArgInfo,
949  (cl_kernel kernel,
950  cl_uint arg_indx,
951  cl_kernel_arg_info param_name,
952  size_t param_value_size,
953  void * param_value,
954  size_t * param_value_size_ret),
955  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
956
957 OCL_FUNC(cl_int, clEnqueueReadImage,
958  (cl_command_queue command_queue,
959  cl_mem image,
960  cl_bool blocking_read,
961  const size_t * origin[3],
962  const size_t * region[3],
963  size_t row_pitch,
964  size_t slice_pitch,
965  void * ptr,
966  cl_uint num_events_in_wait_list,
967  const cl_event * event_wait_list,
968  cl_event * event),
969  (command_queue, image, blocking_read, origin, region,
970  row_pitch, slice_pitch,
971  ptr,
972  num_events_in_wait_list,
973  event_wait_list,
974  event))
975
976 OCL_FUNC(cl_int, clEnqueueWriteImage,
977  (cl_command_queue command_queue,
978  cl_mem image,
979  cl_bool blocking_write,
980  const size_t * origin[3],
981  const size_t * region[3],
982  size_t input_row_pitch,
983  size_t input_slice_pitch,
984  const void * ptr,
985  cl_uint num_events_in_wait_list,
986  const cl_event * event_wait_list,
987  cl_event * event),
988  (command_queue, image, blocking_write, origin, region, input_row_pitch,
989  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
990
991 OCL_FUNC(cl_int, clEnqueueFillImage,
992  (cl_command_queue command_queue,
993  cl_mem image,
994  const void * fill_color,
995  const size_t * origin[3],
996  const size_t * region[3],
997  cl_uint num_events_in_wait_list,
998  const cl_event * event_wait_list,
999  cl_event * event),
1000  (command_queue, image, fill_color, origin, region,
1001  num_events_in_wait_list, event_wait_list, event))
1002
1003 OCL_FUNC(cl_int, clEnqueueCopyImage,
1004  (cl_command_queue command_queue,
1005  cl_mem src_image,
1006  cl_mem dst_image,
1007  const size_t * src_origin[3],
1008  const size_t * dst_origin[3],
1009  const size_t * region[3],
1010  cl_uint num_events_in_wait_list,
1011  const cl_event * event_wait_list,
1012  cl_event * event),
1013  (command_queue, src_image, dst_image, src_origin, dst_origin,
1014  region, num_events_in_wait_list, event_wait_list, event))
1015
1016 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
1017  (cl_command_queue command_queue,
1018  cl_mem src_image,
1019  cl_mem dst_buffer,
1020  const size_t * src_origin[3],
1021  const size_t * region[3],
1022  size_t dst_offset,
1023  cl_uint num_events_in_wait_list,
1024  const cl_event * event_wait_list,
1025  cl_event * event),
1026  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1027  num_events_in_wait_list, event_wait_list, event))
1028 */
1029
1030 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1031  (cl_command_queue command_queue,
1032  cl_mem src_buffer,
1033  cl_mem dst_image,
1034  size_t src_offset,
1035  const size_t dst_origin[3],
1036  const size_t region[3],
1037  cl_uint num_events_in_wait_list,
1038  const cl_event * event_wait_list,
1039  cl_event * event),
1040  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1041  region, num_events_in_wait_list, event_wait_list, event))
1042
1043  OCL_FUNC(cl_int, clFlush,
1044  (cl_command_queue command_queue),
1045  (command_queue))
1046
1047 /*
1048 OCL_FUNC_P(void*, clEnqueueMapImage,
1049  (cl_command_queue command_queue,
1050  cl_mem image,
1051  cl_bool blocking_map,
1052  cl_map_flags map_flags,
1053  const size_t * origin[3],
1054  const size_t * region[3],
1055  size_t * image_row_pitch,
1056  size_t * image_slice_pitch,
1057  cl_uint num_events_in_wait_list,
1058  const cl_event * event_wait_list,
1059  cl_event * event,
1060  cl_int * errcode_ret),
1061  (command_queue, image, blocking_map, map_flags, origin, region,
1062  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1063  event_wait_list, event, errcode_ret))
1064 */
1065
1066 /*
1067 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1068
1069 OCL_FUNC(cl_int, clGetKernelInfo,
1070  (cl_kernel kernel,
1071  cl_kernel_info param_name,
1072  size_t param_value_size,
1073  void * param_value,
1074  size_t * param_value_size_ret),
1075  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1076
1077 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1078
1079 */
1080
1081 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1082
1083
1084 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1085     (cl_context context,
1086     cl_uint count,
1087     const char ** strings,
1088     const size_t * lengths,
1089     cl_int * errcode_ret),
1090     (context, count, strings, lengths, errcode_ret))
1091
1092 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1093     (cl_context context,
1094     cl_uint num_devices,
1095     const cl_device_id * device_list,
1096     const size_t * lengths,
1097     const unsigned char ** binaries,
1098     cl_int * binary_status,
1099     cl_int * errcode_ret),
1100     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1101
1102 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1103
1104 OCL_FUNC(cl_int, clBuildProgram,
1105     (cl_program program,
1106     cl_uint num_devices,
1107     const cl_device_id * device_list,
1108     const char * options,
1109     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1110     void * user_data),
1111     (program, num_devices, device_list, options, pfn_notify, user_data))
1112
1113 OCL_FUNC(cl_int, clGetProgramInfo,
1114     (cl_program program,
1115     cl_program_info param_name,
1116     size_t param_value_size,
1117     void * param_value,
1118     size_t * param_value_size_ret),
1119     (program, param_name, param_value_size, param_value, param_value_size_ret))
1120
1121 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1122     (cl_program program,
1123     cl_device_id device,
1124     cl_program_build_info param_name,
1125     size_t param_value_size,
1126     void * param_value,
1127     size_t * param_value_size_ret),
1128     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1129
1130 OCL_FUNC_P(cl_kernel, clCreateKernel,
1131     (cl_program program,
1132     const char * kernel_name,
1133     cl_int * errcode_ret),
1134     (program, kernel_name, errcode_ret))
1135
1136 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1137
1138 OCL_FUNC(cl_int, clSetKernelArg,
1139     (cl_kernel kernel,
1140     cl_uint arg_index,
1141     size_t arg_size,
1142     const void * arg_value),
1143     (kernel, arg_index, arg_size, arg_value))
1144
1145 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1146     (cl_kernel kernel,
1147     cl_device_id device,
1148     cl_kernel_work_group_info param_name,
1149     size_t param_value_size,
1150     void * param_value,
1151     size_t * param_value_size_ret),
1152     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1153
1154 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1155
1156 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1157     (cl_command_queue command_queue,
1158     cl_mem buffer,
1159     cl_bool blocking_read,
1160     size_t offset,
1161     size_t size,
1162     void * ptr,
1163     cl_uint num_events_in_wait_list,
1164     const cl_event * event_wait_list,
1165     cl_event * event),
1166     (command_queue, buffer, blocking_read, offset, size, ptr,
1167     num_events_in_wait_list, event_wait_list, event))
1168
1169 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1170     (cl_command_queue command_queue,
1171     cl_mem buffer,
1172     cl_bool blocking_read,
1173     const size_t * buffer_offset,
1174     const size_t * host_offset,
1175     const size_t * region,
1176     size_t buffer_row_pitch,
1177     size_t buffer_slice_pitch,
1178     size_t host_row_pitch,
1179     size_t host_slice_pitch,
1180     void * ptr,
1181     cl_uint num_events_in_wait_list,
1182     const cl_event * event_wait_list,
1183     cl_event * event),
1184     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1185     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1186     event_wait_list, event))
1187
1188 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1189     (cl_command_queue command_queue,
1190     cl_mem buffer,
1191     cl_bool blocking_write,
1192     size_t offset,
1193     size_t size,
1194     const void * ptr,
1195     cl_uint num_events_in_wait_list,
1196     const cl_event * event_wait_list,
1197     cl_event * event),
1198     (command_queue, buffer, blocking_write, offset, size, ptr,
1199     num_events_in_wait_list, event_wait_list, event))
1200
1201 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1202     (cl_command_queue command_queue,
1203     cl_mem buffer,
1204     cl_bool blocking_write,
1205     const size_t * buffer_offset,
1206     const size_t * host_offset,
1207     const size_t * region,
1208     size_t buffer_row_pitch,
1209     size_t buffer_slice_pitch,
1210     size_t host_row_pitch,
1211     size_t host_slice_pitch,
1212     const void * ptr,
1213     cl_uint num_events_in_wait_list,
1214     const cl_event * event_wait_list,
1215     cl_event * event),
1216     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1217     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1218     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1219
1220 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1221     (cl_command_queue command_queue,
1222     cl_mem buffer,
1223     const void * pattern,
1224     size_t pattern_size,
1225     size_t offset,
1226     size_t size,
1227     cl_uint num_events_in_wait_list,
1228     const cl_event * event_wait_list,
1229     cl_event * event),
1230     (command_queue, buffer, pattern, pattern_size, offset, size,
1231     num_events_in_wait_list, event_wait_list, event))*/
1232
1233 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1234     (cl_command_queue command_queue,
1235     cl_mem src_buffer,
1236     cl_mem dst_buffer,
1237     size_t src_offset,
1238     size_t dst_offset,
1239     size_t size,
1240     cl_uint num_events_in_wait_list,
1241     const cl_event * event_wait_list,
1242     cl_event * event),
1243     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1244     size, num_events_in_wait_list, event_wait_list, event))
1245
1246 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1247     (cl_command_queue command_queue,
1248     cl_mem src_buffer,
1249     cl_mem dst_buffer,
1250     const size_t * src_origin,
1251     const size_t * dst_origin,
1252     const size_t * region,
1253     size_t src_row_pitch,
1254     size_t src_slice_pitch,
1255     size_t dst_row_pitch,
1256     size_t dst_slice_pitch,
1257     cl_uint num_events_in_wait_list,
1258     const cl_event * event_wait_list,
1259     cl_event * event),
1260     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1261     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1262     num_events_in_wait_list, event_wait_list, event))
1263
1264 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1265     (cl_command_queue command_queue,
1266     cl_mem buffer,
1267     cl_bool blocking_map,
1268     cl_map_flags map_flags,
1269     size_t offset,
1270     size_t size,
1271     cl_uint num_events_in_wait_list,
1272     const cl_event * event_wait_list,
1273     cl_event * event,
1274     cl_int * errcode_ret),
1275     (command_queue, buffer, blocking_map, map_flags, offset, size,
1276     num_events_in_wait_list, event_wait_list, event, errcode_ret))
1277
1278 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1279     (cl_command_queue command_queue,
1280     cl_mem memobj,
1281     void * mapped_ptr,
1282     cl_uint num_events_in_wait_list,
1283     const cl_event * event_wait_list,
1284     cl_event * event),
1285     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1286
1287 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1288     (cl_command_queue command_queue,
1289     cl_kernel kernel,
1290     cl_uint work_dim,
1291     const size_t * global_work_offset,
1292     const size_t * global_work_size,
1293     const size_t * local_work_size,
1294     cl_uint num_events_in_wait_list,
1295     const cl_event * event_wait_list,
1296     cl_event * event),
1297     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1298     local_work_size, num_events_in_wait_list, event_wait_list, event))
1299
1300 OCL_FUNC(cl_int, clEnqueueTask,
1301     (cl_command_queue command_queue,
1302     cl_kernel kernel,
1303     cl_uint num_events_in_wait_list,
1304     const cl_event * event_wait_list,
1305     cl_event * event),
1306     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1307
1308 OCL_FUNC(cl_int, clSetEventCallback,
1309     (cl_event event,
1310     cl_int command_exec_callback_type ,
1311     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1312     void *user_data),
1313     (event, command_exec_callback_type, pfn_event_notify, user_data))
1314
1315 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1316
1317 }
1318
1319 #endif
1320
1321 #ifndef CL_VERSION_1_2
1322 #define CL_VERSION_1_2
1323 #endif
1324
1325 #endif
1326
1327 #ifdef _DEBUG
1328 #define CV_OclDbgAssert CV_DbgAssert
1329 #else
1330 static bool isRaiseError()
1331 {
1332     static bool initialized = false;
1333     static bool value = false;
1334     if (!initialized)
1335     {
1336         value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
1337         initialized = true;
1338     }
1339     return value;
1340 }
1341 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
1342 #endif
1343
1344 namespace cv { namespace ocl {
1345
1346 struct UMat2D
1347 {
1348     UMat2D(const UMat& m)
1349     {
1350         offset = (int)m.offset;
1351         step = (int)m.step;
1352         rows = m.rows;
1353         cols = m.cols;
1354     }
1355     int offset;
1356     int step;
1357     int rows;
1358     int cols;
1359 };
1360
1361 struct UMat3D
1362 {
1363     UMat3D(const UMat& m)
1364     {
1365         offset = (int)m.offset;
1366         step = (int)m.step.p[1];
1367         slicestep = (int)m.step.p[0];
1368         slices = (int)m.size.p[0];
1369         rows = m.size.p[1];
1370         cols = m.size.p[2];
1371     }
1372     int offset;
1373     int slicestep;
1374     int step;
1375     int slices;
1376     int rows;
1377     int cols;
1378 };
1379
1380 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1381 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1382 {
1383     static uint64 table[256];
1384     static bool initialized = false;
1385
1386     if( !initialized )
1387     {
1388         for( int i = 0; i < 256; i++ )
1389         {
1390             uint64 c = i;
1391             for( int j = 0; j < 8; j++ )
1392                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1393             table[i] = c;
1394         }
1395         initialized = true;
1396     }
1397
1398     uint64 crc = ~crc0;
1399     for( size_t idx = 0; idx < size; idx++ )
1400         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1401
1402     return ~crc;
1403 }
1404
1405 struct HashKey
1406 {
1407     typedef uint64 part;
1408     HashKey(part _a, part _b) : a(_a), b(_b) {}
1409     part a, b;
1410 };
1411
1412 inline bool operator == (const HashKey& h1, const HashKey& h2)
1413 {
1414     return h1.a == h2.a && h1.b == h2.b;
1415 }
1416
1417 inline bool operator < (const HashKey& h1, const HashKey& h2)
1418 {
1419     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1420 }
1421
1422
1423 bool haveOpenCL()
1424 {
1425 #ifdef HAVE_OPENCL
1426     static bool g_isOpenCLInitialized = false;
1427     static bool g_isOpenCLAvailable = false;
1428
1429     if (!g_isOpenCLInitialized)
1430     {
1431         try
1432         {
1433             cl_uint n = 0;
1434             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1435         }
1436         catch (...)
1437         {
1438             g_isOpenCLAvailable = false;
1439         }
1440         g_isOpenCLInitialized = true;
1441     }
1442     return g_isOpenCLAvailable;
1443 #else
1444     return false;
1445 #endif
1446 }
1447
1448 bool useOpenCL()
1449 {
1450     CoreTLSData* data = coreTlsData.get();
1451     if( data->useOpenCL < 0 )
1452     {
1453         try
1454         {
1455             data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
1456         }
1457         catch (...)
1458         {
1459             data->useOpenCL = 0;
1460         }
1461     }
1462     return data->useOpenCL > 0;
1463 }
1464
1465 void setUseOpenCL(bool flag)
1466 {
1467     if( haveOpenCL() )
1468     {
1469         CoreTLSData* data = coreTlsData.get();
1470         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1471     }
1472 }
1473
1474 #ifdef HAVE_CLAMDBLAS
1475
1476 class AmdBlasHelper
1477 {
1478 public:
1479     static AmdBlasHelper & getInstance()
1480     {
1481         static AmdBlasHelper amdBlas;
1482         return amdBlas;
1483     }
1484
1485     bool isAvailable() const
1486     {
1487         return g_isAmdBlasAvailable;
1488     }
1489
1490     ~AmdBlasHelper()
1491     {
1492         try
1493         {
1494             clAmdBlasTeardown();
1495         }
1496         catch (...) { }
1497     }
1498
1499 protected:
1500     AmdBlasHelper()
1501     {
1502         if (!g_isAmdBlasInitialized)
1503         {
1504             AutoLock lock(m);
1505
1506             if (!g_isAmdBlasInitialized && haveOpenCL())
1507             {
1508                 try
1509                 {
1510                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1511                 }
1512                 catch (...)
1513                 {
1514                     g_isAmdBlasAvailable = false;
1515                 }
1516             }
1517             else
1518                 g_isAmdBlasAvailable = false;
1519
1520             g_isAmdBlasInitialized = true;
1521         }
1522     }
1523
1524 private:
1525     static Mutex m;
1526     static bool g_isAmdBlasInitialized;
1527     static bool g_isAmdBlasAvailable;
1528 };
1529
1530 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1531 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1532 Mutex AmdBlasHelper::m;
1533
1534 bool haveAmdBlas()
1535 {
1536     return AmdBlasHelper::getInstance().isAvailable();
1537 }
1538
1539 #else
1540
1541 bool haveAmdBlas()
1542 {
1543     return false;
1544 }
1545
1546 #endif
1547
1548 #ifdef HAVE_CLAMDFFT
1549
1550 class AmdFftHelper
1551 {
1552 public:
1553     static AmdFftHelper & getInstance()
1554     {
1555         static AmdFftHelper amdFft;
1556         return amdFft;
1557     }
1558
1559     bool isAvailable() const
1560     {
1561         return g_isAmdFftAvailable;
1562     }
1563
1564     ~AmdFftHelper()
1565     {
1566         try
1567         {
1568 //            clAmdFftTeardown();
1569         }
1570         catch (...) { }
1571     }
1572
1573 protected:
1574     AmdFftHelper()
1575     {
1576         if (!g_isAmdFftInitialized)
1577         {
1578             AutoLock lock(m);
1579
1580             if (!g_isAmdFftInitialized && haveOpenCL())
1581             {
1582                 try
1583                 {
1584                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1585                     g_isAmdFftAvailable = true;
1586                 }
1587                 catch (const Exception &)
1588                 {
1589                     g_isAmdFftAvailable = false;
1590                 }
1591             }
1592             else
1593                 g_isAmdFftAvailable = false;
1594
1595             g_isAmdFftInitialized = true;
1596         }
1597     }
1598
1599 private:
1600     static clAmdFftSetupData setupData;
1601     static Mutex m;
1602     static bool g_isAmdFftInitialized;
1603     static bool g_isAmdFftAvailable;
1604 };
1605
1606 clAmdFftSetupData AmdFftHelper::setupData;
1607 bool AmdFftHelper::g_isAmdFftAvailable = false;
1608 bool AmdFftHelper::g_isAmdFftInitialized = false;
1609 Mutex AmdFftHelper::m;
1610
1611 bool haveAmdFft()
1612 {
1613     return AmdFftHelper::getInstance().isAvailable();
1614 }
1615
1616 #else
1617
1618 bool haveAmdFft()
1619 {
1620     return false;
1621 }
1622
1623 #endif
1624
1625 void finish()
1626 {
1627     Queue::getDefault().finish();
1628 }
1629
1630 #define IMPLEMENT_REFCOUNTABLE() \
1631     void addref() { CV_XADD(&refcount, 1); } \
1632     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1633     int refcount
1634
1635 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1636
1637 struct Platform::Impl
1638 {
1639     Impl()
1640     {
1641         refcount = 1;
1642         handle = 0;
1643         initialized = false;
1644     }
1645
1646     ~Impl() {}
1647
1648     void init()
1649     {
1650         if( !initialized )
1651         {
1652             //cl_uint num_entries
1653             cl_uint n = 0;
1654             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1655                 handle = 0;
1656             if( handle != 0 )
1657             {
1658                 char buf[1000];
1659                 size_t len = 0;
1660                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1661                 buf[len] = '\0';
1662                 vendor = String(buf);
1663             }
1664
1665             initialized = true;
1666         }
1667     }
1668
1669     IMPLEMENT_REFCOUNTABLE();
1670
1671     cl_platform_id handle;
1672     String vendor;
1673     bool initialized;
1674 };
1675
1676 Platform::Platform()
1677 {
1678     p = 0;
1679 }
1680
1681 Platform::~Platform()
1682 {
1683     if(p)
1684         p->release();
1685 }
1686
1687 Platform::Platform(const Platform& pl)
1688 {
1689     p = (Impl*)pl.p;
1690     if(p)
1691         p->addref();
1692 }
1693
1694 Platform& Platform::operator = (const Platform& pl)
1695 {
1696     Impl* newp = (Impl*)pl.p;
1697     if(newp)
1698         newp->addref();
1699     if(p)
1700         p->release();
1701     p = newp;
1702     return *this;
1703 }
1704
1705 void* Platform::ptr() const
1706 {
1707     return p ? p->handle : 0;
1708 }
1709
1710 Platform& Platform::getDefault()
1711 {
1712     static Platform p;
1713     if( !p.p )
1714     {
1715         p.p = new Impl;
1716         p.p->init();
1717     }
1718     return p;
1719 }
1720
1721 /////////////////////////////////////// Device ////////////////////////////////////////////
1722
1723 // deviceVersion has format
1724 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1725 // by specification
1726 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1727 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1728 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1729 {
1730     major = minor = 0;
1731     if (10 >= deviceVersion.length())
1732         return;
1733     const char *pstr = deviceVersion.c_str();
1734     if (0 != strncmp(pstr, "OpenCL ", 7))
1735         return;
1736     size_t ppos = deviceVersion.find('.', 7);
1737     if (String::npos == ppos)
1738         return;
1739     String temp = deviceVersion.substr(7, ppos - 7);
1740     major = atoi(temp.c_str());
1741     temp = deviceVersion.substr(ppos + 1);
1742     minor = atoi(temp.c_str());
1743 }
1744
1745 struct Device::Impl
1746 {
1747     Impl(void* d)
1748     {
1749         handle = (cl_device_id)d;
1750         refcount = 1;
1751
1752         name_ = getStrProp(CL_DEVICE_NAME);
1753         version_ = getStrProp(CL_DEVICE_VERSION);
1754         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1755         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1756         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1757         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1758         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1759         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1760
1761         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1762         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1763
1764         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1765         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1766             vendorName_ == "AMD")
1767             vendorID_ = VENDOR_AMD;
1768         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1769             vendorID_ = VENDOR_INTEL;
1770         else if (vendorName_ == "NVIDIA Corporation")
1771             vendorID_ = VENDOR_NVIDIA;
1772         else
1773             vendorID_ = UNKNOWN_VENDOR;
1774     }
1775
1776     template<typename _TpCL, typename _TpOut>
1777     _TpOut getProp(cl_device_info prop) const
1778     {
1779         _TpCL temp=_TpCL();
1780         size_t sz = 0;
1781
1782         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1783             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1784     }
1785
1786     bool getBoolProp(cl_device_info prop) const
1787     {
1788         cl_bool temp = CL_FALSE;
1789         size_t sz = 0;
1790
1791         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1792             sz == sizeof(temp) ? temp != 0 : false;
1793     }
1794
1795     String getStrProp(cl_device_info prop) const
1796     {
1797         char buf[1024];
1798         size_t sz=0;
1799         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1800             sz < sizeof(buf) ? String(buf) : String();
1801     }
1802
1803     IMPLEMENT_REFCOUNTABLE();
1804     cl_device_id handle;
1805
1806     String name_;
1807     String version_;
1808     int doubleFPConfig_;
1809     bool hostUnifiedMemory_;
1810     int maxComputeUnits_;
1811     size_t maxWorkGroupSize_;
1812     int type_;
1813     int deviceVersionMajor_;
1814     int deviceVersionMinor_;
1815     String driverVersion_;
1816     String vendorName_;
1817     int vendorID_;
1818 };
1819
1820
1821 Device::Device()
1822 {
1823     p = 0;
1824 }
1825
1826 Device::Device(void* d)
1827 {
1828     p = 0;
1829     set(d);
1830 }
1831
1832 Device::Device(const Device& d)
1833 {
1834     p = d.p;
1835     if(p)
1836         p->addref();
1837 }
1838
1839 Device& Device::operator = (const Device& d)
1840 {
1841     Impl* newp = (Impl*)d.p;
1842     if(newp)
1843         newp->addref();
1844     if(p)
1845         p->release();
1846     p = newp;
1847     return *this;
1848 }
1849
1850 Device::~Device()
1851 {
1852     if(p)
1853         p->release();
1854 }
1855
1856 void Device::set(void* d)
1857 {
1858     if(p)
1859         p->release();
1860     p = new Impl(d);
1861 }
1862
1863 void* Device::ptr() const
1864 {
1865     return p ? p->handle : 0;
1866 }
1867
1868 String Device::name() const
1869 { return p ? p->name_ : String(); }
1870
1871 String Device::extensions() const
1872 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1873
1874 String Device::version() const
1875 { return p ? p->version_ : String(); }
1876
1877 String Device::vendorName() const
1878 { return p ? p->vendorName_ : String(); }
1879
1880 int Device::vendorID() const
1881 { return p ? p->vendorID_ : 0; }
1882
1883 String Device::OpenCL_C_Version() const
1884 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1885
1886 String Device::OpenCLVersion() const
1887 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1888
1889 int Device::deviceVersionMajor() const
1890 { return p ? p->deviceVersionMajor_ : 0; }
1891
1892 int Device::deviceVersionMinor() const
1893 { return p ? p->deviceVersionMinor_ : 0; }
1894
1895 String Device::driverVersion() const
1896 { return p ? p->driverVersion_ : String(); }
1897
1898 int Device::type() const
1899 { return p ? p->type_ : 0; }
1900
1901 int Device::addressBits() const
1902 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1903
1904 bool Device::available() const
1905 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1906
1907 bool Device::compilerAvailable() const
1908 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1909
1910 bool Device::linkerAvailable() const
1911 #ifdef CL_VERSION_1_2
1912 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1913 #else
1914 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1915 #endif
1916
1917 int Device::doubleFPConfig() const
1918 { return p ? p->doubleFPConfig_ : 0; }
1919
1920 int Device::singleFPConfig() const
1921 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1922
1923 int Device::halfFPConfig() const
1924 #ifdef CL_VERSION_1_2
1925 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1926 #else
1927 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1928 #endif
1929
1930 bool Device::endianLittle() const
1931 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1932
1933 bool Device::errorCorrectionSupport() const
1934 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1935
1936 int Device::executionCapabilities() const
1937 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1938
1939 size_t Device::globalMemCacheSize() const
1940 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1941
1942 int Device::globalMemCacheType() const
1943 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1944
1945 int Device::globalMemCacheLineSize() const
1946 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1947
1948 size_t Device::globalMemSize() const
1949 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1950
1951 size_t Device::localMemSize() const
1952 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1953
1954 int Device::localMemType() const
1955 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1956
1957 bool Device::hostUnifiedMemory() const
1958 { return p ? p->hostUnifiedMemory_ : false; }
1959
1960 bool Device::imageSupport() const
1961 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1962
1963 bool Device::imageFromBufferSupport() const
1964 {
1965     bool ret = false;
1966     if (p)
1967     {
1968         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
1969         if (pos != String::npos)
1970         {
1971             ret = true;
1972         }
1973     }
1974     return ret;
1975 }
1976
1977 uint Device::imagePitchAlignment() const
1978 {
1979 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1980     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1981 #else
1982     return 0;
1983 #endif
1984 }
1985
1986 uint Device::imageBaseAddressAlignment() const
1987 {
1988 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1989     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1990 #else
1991     return 0;
1992 #endif
1993 }
1994
1995 size_t Device::image2DMaxWidth() const
1996 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1997
1998 size_t Device::image2DMaxHeight() const
1999 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
2000
2001 size_t Device::image3DMaxWidth() const
2002 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
2003
2004 size_t Device::image3DMaxHeight() const
2005 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
2006
2007 size_t Device::image3DMaxDepth() const
2008 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
2009
2010 size_t Device::imageMaxBufferSize() const
2011 #ifdef CL_VERSION_1_2
2012 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
2013 #else
2014 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2015 #endif
2016
2017 size_t Device::imageMaxArraySize() const
2018 #ifdef CL_VERSION_1_2
2019 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
2020 #else
2021 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2022 #endif
2023
2024 int Device::maxClockFrequency() const
2025 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
2026
2027 int Device::maxComputeUnits() const
2028 { return p ? p->maxComputeUnits_ : 0; }
2029
2030 int Device::maxConstantArgs() const
2031 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
2032
2033 size_t Device::maxConstantBufferSize() const
2034 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2035
2036 size_t Device::maxMemAllocSize() const
2037 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2038
2039 size_t Device::maxParameterSize() const
2040 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2041
2042 int Device::maxReadImageArgs() const
2043 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2044
2045 int Device::maxWriteImageArgs() const
2046 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2047
2048 int Device::maxSamplers() const
2049 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2050
2051 size_t Device::maxWorkGroupSize() const
2052 { return p ? p->maxWorkGroupSize_ : 0; }
2053
2054 int Device::maxWorkItemDims() const
2055 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2056
2057 void Device::maxWorkItemSizes(size_t* sizes) const
2058 {
2059     if(p)
2060     {
2061         const int MAX_DIMS = 32;
2062         size_t retsz = 0;
2063         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2064                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2065     }
2066 }
2067
2068 int Device::memBaseAddrAlign() const
2069 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2070
2071 int Device::nativeVectorWidthChar() const
2072 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2073
2074 int Device::nativeVectorWidthShort() const
2075 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2076
2077 int Device::nativeVectorWidthInt() const
2078 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2079
2080 int Device::nativeVectorWidthLong() const
2081 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2082
2083 int Device::nativeVectorWidthFloat() const
2084 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2085
2086 int Device::nativeVectorWidthDouble() const
2087 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2088
2089 int Device::nativeVectorWidthHalf() const
2090 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2091
2092 int Device::preferredVectorWidthChar() const
2093 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2094
2095 int Device::preferredVectorWidthShort() const
2096 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2097
2098 int Device::preferredVectorWidthInt() const
2099 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2100
2101 int Device::preferredVectorWidthLong() const
2102 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2103
2104 int Device::preferredVectorWidthFloat() const
2105 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2106
2107 int Device::preferredVectorWidthDouble() const
2108 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2109
2110 int Device::preferredVectorWidthHalf() const
2111 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2112
2113 size_t Device::printfBufferSize() const
2114 #ifdef CL_VERSION_1_2
2115 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2116 #else
2117 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2118 #endif
2119
2120
2121 size_t Device::profilingTimerResolution() const
2122 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2123
2124 const Device& Device::getDefault()
2125 {
2126     const Context& ctx = Context::getDefault();
2127     int idx = coreTlsData.get()->device;
2128     return ctx.device(idx);
2129 }
2130
2131 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2132
2133 template <typename Functor, typename ObjectType>
2134 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2135 {
2136     ::size_t required;
2137     cl_int err = f(obj, name, 0, NULL, &required);
2138     if (err != CL_SUCCESS)
2139         return err;
2140
2141     param.clear();
2142     if (required > 0)
2143     {
2144         AutoBuffer<char> buf(required + 1);
2145         char* ptr = (char*)buf; // cleanup is not needed
2146         err = f(obj, name, required, ptr, NULL);
2147         if (err != CL_SUCCESS)
2148             return err;
2149         param = ptr;
2150     }
2151
2152     return CL_SUCCESS;
2153 }
2154
2155 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2156 {
2157     elems.clear();
2158     if (s.size() == 0)
2159         return;
2160     std::istringstream ss(s);
2161     std::string item;
2162     while (!ss.eof())
2163     {
2164         std::getline(ss, item, delim);
2165         elems.push_back(item);
2166     }
2167 }
2168
2169 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2170 // Sample: AMD:GPU:
2171 // Sample: AMD:GPU:Tahiti
2172 // Sample: :GPU|CPU: = '' = ':' = '::'
2173 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2174         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2175 {
2176     std::vector<std::string> parts;
2177     split(configurationStr, ':', parts);
2178     if (parts.size() > 3)
2179     {
2180         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2181         return false;
2182     }
2183     if (parts.size() > 2)
2184         deviceNameOrID = parts[2];
2185     if (parts.size() > 1)
2186     {
2187         split(parts[1], '|', deviceTypes);
2188     }
2189     if (parts.size() > 0)
2190     {
2191         platform = parts[0];
2192     }
2193     return true;
2194 }
2195
2196 #ifdef HAVE_WINRT
2197 static cl_device_id selectOpenCLDevice()
2198 {
2199     return NULL;
2200 }
2201 #else
2202 static cl_device_id selectOpenCLDevice()
2203 {
2204     std::string platform, deviceName;
2205     std::vector<std::string> deviceTypes;
2206
2207     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2208     if (configuration && !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
2209         return NULL;
2210
2211     bool isID = false;
2212     int deviceID = -1;
2213     if (deviceName.length() == 1)
2214     // We limit ID range to 0..9, because we want to write:
2215     // - '2500' to mean i5-2500
2216     // - '8350' to mean AMD FX-8350
2217     // - '650' to mean GeForce 650
2218     // To extend ID range change condition to '> 0'
2219     {
2220         isID = true;
2221         for (size_t i = 0; i < deviceName.length(); i++)
2222         {
2223             if (!isdigit(deviceName[i]))
2224             {
2225                 isID = false;
2226                 break;
2227             }
2228         }
2229         if (isID)
2230         {
2231             deviceID = atoi(deviceName.c_str());
2232             if (deviceID < 0)
2233                 return NULL;
2234         }
2235     }
2236
2237     std::vector<cl_platform_id> platforms;
2238     {
2239         cl_uint numPlatforms = 0;
2240         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2241
2242         if (numPlatforms == 0)
2243             return NULL;
2244         platforms.resize((size_t)numPlatforms);
2245         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2246         platforms.resize(numPlatforms);
2247     }
2248
2249     int selectedPlatform = -1;
2250     if (platform.length() > 0)
2251     {
2252         for (size_t i = 0; i < platforms.size(); i++)
2253         {
2254             std::string name;
2255             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2256             if (name.find(platform) != std::string::npos)
2257             {
2258                 selectedPlatform = (int)i;
2259                 break;
2260             }
2261         }
2262         if (selectedPlatform == -1)
2263         {
2264             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2265             goto not_found;
2266         }
2267     }
2268     if (deviceTypes.size() == 0)
2269     {
2270         if (!isID)
2271         {
2272             deviceTypes.push_back("GPU");
2273             if (configuration)
2274                 deviceTypes.push_back("CPU");
2275         }
2276         else
2277             deviceTypes.push_back("ALL");
2278     }
2279     for (size_t t = 0; t < deviceTypes.size(); t++)
2280     {
2281         int deviceType = 0;
2282         std::string tempStrDeviceType = deviceTypes[t];
2283         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2284
2285         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2286             deviceType = Device::TYPE_GPU;
2287         else if (tempStrDeviceType == "cpu")
2288             deviceType = Device::TYPE_CPU;
2289         else if (tempStrDeviceType == "accelerator")
2290             deviceType = Device::TYPE_ACCELERATOR;
2291         else if (tempStrDeviceType == "all")
2292             deviceType = Device::TYPE_ALL;
2293         else
2294         {
2295             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2296             goto not_found;
2297         }
2298
2299         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2300         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2301                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2302                 i++)
2303         {
2304             cl_uint count = 0;
2305             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2306             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2307             if (count == 0)
2308                 continue;
2309             size_t base = devices.size();
2310             devices.resize(base + count);
2311             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2312             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2313         }
2314
2315         for (size_t i = (isID ? deviceID : 0);
2316              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2317              i++)
2318         {
2319             std::string name;
2320             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2321             cl_bool useGPU = true;
2322             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2323             {
2324                 cl_bool isIGPU = CL_FALSE;
2325                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2326                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2327             }
2328             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2329             {
2330                 // TODO check for OpenCL 1.1
2331                 return devices[i];
2332             }
2333         }
2334     }
2335
2336 not_found:
2337     std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2338             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2339             << "    Device types: ";
2340     for (size_t t = 0; t < deviceTypes.size(); t++)
2341         std::cerr << deviceTypes[t] << " ";
2342
2343     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2344     CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
2345     return NULL;
2346 }
2347 #endif
2348
2349 struct Context::Impl
2350 {
2351     Impl()
2352     {
2353         refcount = 1;
2354         handle = 0;
2355     }
2356
2357     void setDefault()
2358     {
2359         CV_Assert(handle == NULL);
2360
2361         cl_device_id d = selectOpenCLDevice();
2362
2363         if (d == NULL)
2364             return;
2365
2366         cl_platform_id pl = NULL;
2367         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2368
2369         cl_context_properties prop[] =
2370         {
2371             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2372             0
2373         };
2374
2375         // !!! in the current implementation force the number of devices to 1 !!!
2376         cl_uint nd = 1;
2377         cl_int status;
2378
2379         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2380
2381         bool ok = handle != 0 && status == CL_SUCCESS;
2382         if( ok )
2383         {
2384             devices.resize(nd);
2385             devices[0].set(d);
2386         }
2387         else
2388             handle = NULL;
2389     }
2390
2391     Impl(int dtype0)
2392     {
2393         refcount = 1;
2394         handle = 0;
2395
2396         cl_int retval = 0;
2397         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2398         cl_context_properties prop[] =
2399         {
2400             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2401             0
2402         };
2403
2404         cl_uint i, nd0 = 0, nd = 0;
2405         int dtype = dtype0 & 15;
2406         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2407
2408         AutoBuffer<void*> dlistbuf(nd0*2+1);
2409         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2410         cl_device_id* dlist_new = dlist + nd0;
2411         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2412         String name0;
2413
2414         for(i = 0; i < nd0; i++)
2415         {
2416             Device d(dlist[i]);
2417             if( !d.available() || !d.compilerAvailable() )
2418                 continue;
2419             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2420                 continue;
2421             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2422                 continue;
2423             String name = d.name();
2424             if( nd != 0 && name != name0 )
2425                 continue;
2426             name0 = name;
2427             dlist_new[nd++] = dlist[i];
2428         }
2429
2430         if(nd == 0)
2431             return;
2432
2433         // !!! in the current implementation force the number of devices to 1 !!!
2434         nd = 1;
2435
2436         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2437         bool ok = handle != 0 && retval == CL_SUCCESS;
2438         if( ok )
2439         {
2440             devices.resize(nd);
2441             for( i = 0; i < nd; i++ )
2442                 devices[i].set(dlist_new[i]);
2443         }
2444     }
2445
2446     ~Impl()
2447     {
2448         if(handle)
2449         {
2450             clReleaseContext(handle);
2451             handle = NULL;
2452         }
2453         devices.clear();
2454     }
2455
2456     Program getProg(const ProgramSource& src,
2457                     const String& buildflags, String& errmsg)
2458     {
2459         String prefix = Program::getPrefix(buildflags);
2460         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2461         phash_t::iterator it = phash.find(k);
2462         if( it != phash.end() )
2463             return it->second;
2464         //String filename = format("%08x%08x_%08x%08x.clb2",
2465         Program prog(src, buildflags, errmsg);
2466         if(prog.ptr())
2467             phash.insert(std::pair<HashKey,Program>(k, prog));
2468         return prog;
2469     }
2470
2471     IMPLEMENT_REFCOUNTABLE();
2472
2473     cl_context handle;
2474     std::vector<Device> devices;
2475
2476     typedef ProgramSource::hash_t hash_t;
2477
2478     struct HashKey
2479     {
2480         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2481         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2482         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2483         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2484         hash_t a, b;
2485     };
2486     typedef std::map<HashKey, Program> phash_t;
2487     phash_t phash;
2488 };
2489
2490
2491 Context::Context()
2492 {
2493     p = 0;
2494 }
2495
2496 Context::Context(int dtype)
2497 {
2498     p = 0;
2499     create(dtype);
2500 }
2501
2502 bool Context::create()
2503 {
2504     if( !haveOpenCL() )
2505         return false;
2506     if(p)
2507         p->release();
2508     p = new Impl();
2509     if(!p->handle)
2510     {
2511         delete p;
2512         p = 0;
2513     }
2514     return p != 0;
2515 }
2516
2517 bool Context::create(int dtype0)
2518 {
2519     if( !haveOpenCL() )
2520         return false;
2521     if(p)
2522         p->release();
2523     p = new Impl(dtype0);
2524     if(!p->handle)
2525     {
2526         delete p;
2527         p = 0;
2528     }
2529     return p != 0;
2530 }
2531
2532 Context::~Context()
2533 {
2534     if (p)
2535     {
2536         p->release();
2537         p = NULL;
2538     }
2539 }
2540
2541 Context::Context(const Context& c)
2542 {
2543     p = (Impl*)c.p;
2544     if(p)
2545         p->addref();
2546 }
2547
2548 Context& Context::operator = (const Context& c)
2549 {
2550     Impl* newp = (Impl*)c.p;
2551     if(newp)
2552         newp->addref();
2553     if(p)
2554         p->release();
2555     p = newp;
2556     return *this;
2557 }
2558
2559 void* Context::ptr() const
2560 {
2561     return p == NULL ? NULL : p->handle;
2562 }
2563
2564 size_t Context::ndevices() const
2565 {
2566     return p ? p->devices.size() : 0;
2567 }
2568
2569 const Device& Context::device(size_t idx) const
2570 {
2571     static Device dummy;
2572     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2573 }
2574
2575 Context& Context::getDefault(bool initialize)
2576 {
2577     static Context* ctx = new Context();
2578     if(!ctx->p && haveOpenCL())
2579     {
2580         if (!ctx->p)
2581             ctx->p = new Impl();
2582         if (initialize)
2583         {
2584             // do not create new Context right away.
2585             // First, try to retrieve existing context of the same type.
2586             // In its turn, Platform::getContext() may call Context::create()
2587             // if there is no such context.
2588             if (ctx->p->handle == NULL)
2589                 ctx->p->setDefault();
2590         }
2591     }
2592
2593     return *ctx;
2594 }
2595
2596 Program Context::getProg(const ProgramSource& prog,
2597                          const String& buildopts, String& errmsg)
2598 {
2599     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2600 }
2601
2602 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2603 {
2604     cl_context context = (cl_context)_context;
2605     cl_device_id device = (cl_device_id)_device;
2606
2607     // cleanup old context
2608     Context::Impl * impl = ctx.p;
2609     if (impl->handle)
2610     {
2611         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2612     }
2613     impl->devices.clear();
2614
2615     impl->handle = context;
2616     impl->devices.resize(1);
2617     impl->devices[0].set(device);
2618
2619     Platform& p = Platform::getDefault();
2620     Platform::Impl* pImpl = p.p;
2621     pImpl->handle = (cl_platform_id)platform;
2622 }
2623
2624 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2625
2626 struct Queue::Impl
2627 {
2628     Impl(const Context& c, const Device& d)
2629     {
2630         refcount = 1;
2631         const Context* pc = &c;
2632         cl_context ch = (cl_context)pc->ptr();
2633         if( !ch )
2634         {
2635             pc = &Context::getDefault();
2636             ch = (cl_context)pc->ptr();
2637         }
2638         cl_device_id dh = (cl_device_id)d.ptr();
2639         if( !dh )
2640             dh = (cl_device_id)pc->device(0).ptr();
2641         cl_int retval = 0;
2642         handle = clCreateCommandQueue(ch, dh, 0, &retval);
2643         CV_OclDbgAssert(retval == CL_SUCCESS);
2644     }
2645
2646     ~Impl()
2647     {
2648 #ifdef _WIN32
2649         if (!cv::__termination)
2650 #endif
2651         {
2652             if(handle)
2653             {
2654                 clFinish(handle);
2655                 clReleaseCommandQueue(handle);
2656                 handle = NULL;
2657             }
2658         }
2659     }
2660
2661     IMPLEMENT_REFCOUNTABLE();
2662
2663     cl_command_queue handle;
2664 };
2665
2666 Queue::Queue()
2667 {
2668     p = 0;
2669 }
2670
2671 Queue::Queue(const Context& c, const Device& d)
2672 {
2673     p = 0;
2674     create(c, d);
2675 }
2676
2677 Queue::Queue(const Queue& q)
2678 {
2679     p = q.p;
2680     if(p)
2681         p->addref();
2682 }
2683
2684 Queue& Queue::operator = (const Queue& q)
2685 {
2686     Impl* newp = (Impl*)q.p;
2687     if(newp)
2688         newp->addref();
2689     if(p)
2690         p->release();
2691     p = newp;
2692     return *this;
2693 }
2694
2695 Queue::~Queue()
2696 {
2697     if(p)
2698         p->release();
2699 }
2700
2701 bool Queue::create(const Context& c, const Device& d)
2702 {
2703     if(p)
2704         p->release();
2705     p = new Impl(c, d);
2706     return p->handle != 0;
2707 }
2708
2709 void Queue::finish()
2710 {
2711     if(p && p->handle)
2712     {
2713         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
2714     }
2715 }
2716
2717 void* Queue::ptr() const
2718 {
2719     return p ? p->handle : 0;
2720 }
2721
2722 Queue& Queue::getDefault()
2723 {
2724     Queue& q = coreTlsData.get()->oclQueue;
2725     if( !q.p && haveOpenCL() )
2726         q.create(Context::getDefault());
2727     return q;
2728 }
2729
2730 static cl_command_queue getQueue(const Queue& q)
2731 {
2732     cl_command_queue qq = (cl_command_queue)q.ptr();
2733     if(!qq)
2734         qq = (cl_command_queue)Queue::getDefault().ptr();
2735     return qq;
2736 }
2737
2738 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2739
2740 KernelArg::KernelArg()
2741     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2742 {
2743 }
2744
2745 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2746     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2747 {
2748 }
2749
2750 KernelArg KernelArg::Constant(const Mat& m)
2751 {
2752     CV_Assert(m.isContinuous());
2753     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2754 }
2755
2756 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2757
2758 struct Kernel::Impl
2759 {
2760     Impl(const char* kname, const Program& prog) :
2761         refcount(1), e(0), nu(0)
2762     {
2763         cl_program ph = (cl_program)prog.ptr();
2764         cl_int retval = 0;
2765         handle = ph != 0 ?
2766             clCreateKernel(ph, kname, &retval) : 0;
2767         CV_OclDbgAssert(retval == CL_SUCCESS);
2768         for( int i = 0; i < MAX_ARRS; i++ )
2769             u[i] = 0;
2770         haveTempDstUMats = false;
2771     }
2772
2773     void cleanupUMats()
2774     {
2775         for( int i = 0; i < MAX_ARRS; i++ )
2776             if( u[i] )
2777             {
2778                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2779                     u[i]->currAllocator->deallocate(u[i]);
2780                 u[i] = 0;
2781             }
2782         nu = 0;
2783         haveTempDstUMats = false;
2784     }
2785
2786     void addUMat(const UMat& m, bool dst)
2787     {
2788         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2789         u[nu] = m.u;
2790         CV_XADD(&m.u->urefcount, 1);
2791         nu++;
2792         if(dst && m.u->tempUMat())
2793             haveTempDstUMats = true;
2794     }
2795
2796     void addImage(const Image2D& image)
2797     {
2798         images.push_back(image);
2799     }
2800
2801     void finit()
2802     {
2803         cleanupUMats();
2804         images.clear();
2805         if(e) { clReleaseEvent(e); e = 0; }
2806         release();
2807     }
2808
2809     ~Impl()
2810     {
2811         if(handle)
2812             clReleaseKernel(handle);
2813     }
2814
2815     IMPLEMENT_REFCOUNTABLE();
2816
2817     cl_kernel handle;
2818     cl_event e;
2819     enum { MAX_ARRS = 16 };
2820     UMatData* u[MAX_ARRS];
2821     int nu;
2822     std::list<Image2D> images;
2823     bool haveTempDstUMats;
2824 };
2825
2826 }}
2827
2828 extern "C"
2829 {
2830 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
2831 {
2832     ((cv::ocl::Kernel::Impl*)p)->finit();
2833 }
2834
2835 }
2836
2837 namespace cv { namespace ocl {
2838
2839 Kernel::Kernel()
2840 {
2841     p = 0;
2842 }
2843
2844 Kernel::Kernel(const char* kname, const Program& prog)
2845 {
2846     p = 0;
2847     create(kname, prog);
2848 }
2849
2850 Kernel::Kernel(const char* kname, const ProgramSource& src,
2851                const String& buildopts, String* errmsg)
2852 {
2853     p = 0;
2854     create(kname, src, buildopts, errmsg);
2855 }
2856
2857 Kernel::Kernel(const Kernel& k)
2858 {
2859     p = k.p;
2860     if(p)
2861         p->addref();
2862 }
2863
2864 Kernel& Kernel::operator = (const Kernel& k)
2865 {
2866     Impl* newp = (Impl*)k.p;
2867     if(newp)
2868         newp->addref();
2869     if(p)
2870         p->release();
2871     p = newp;
2872     return *this;
2873 }
2874
2875 Kernel::~Kernel()
2876 {
2877     if(p)
2878         p->release();
2879 }
2880
2881 bool Kernel::create(const char* kname, const Program& prog)
2882 {
2883     if(p)
2884         p->release();
2885     p = new Impl(kname, prog);
2886     if(p->handle == 0)
2887     {
2888         p->release();
2889         p = 0;
2890     }
2891     return p != 0;
2892 }
2893
2894 bool Kernel::create(const char* kname, const ProgramSource& src,
2895                     const String& buildopts, String* errmsg)
2896 {
2897     if(p)
2898     {
2899         p->release();
2900         p = 0;
2901     }
2902     String tempmsg;
2903     if( !errmsg ) errmsg = &tempmsg;
2904     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2905     return create(kname, prog);
2906 }
2907
2908 void* Kernel::ptr() const
2909 {
2910     return p ? p->handle : 0;
2911 }
2912
2913 bool Kernel::empty() const
2914 {
2915     return ptr() == 0;
2916 }
2917
2918 int Kernel::set(int i, const void* value, size_t sz)
2919 {
2920     if (!p || !p->handle)
2921         return -1;
2922     if (i < 0)
2923         return i;
2924     if( i == 0 )
2925         p->cleanupUMats();
2926
2927     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2928     CV_OclDbgAssert(retval == CL_SUCCESS);
2929     if (retval != CL_SUCCESS)
2930         return -1;
2931     return i+1;
2932 }
2933
2934 int Kernel::set(int i, const Image2D& image2D)
2935 {
2936     p->addImage(image2D);
2937     cl_mem h = (cl_mem)image2D.ptr();
2938     return set(i, &h, sizeof(h));
2939 }
2940
2941 int Kernel::set(int i, const UMat& m)
2942 {
2943     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
2944 }
2945
2946 int Kernel::set(int i, const KernelArg& arg)
2947 {
2948     if( !p || !p->handle )
2949         return -1;
2950     if (i < 0)
2951         return i;
2952     if( i == 0 )
2953         p->cleanupUMats();
2954     if( arg.m )
2955     {
2956         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2957                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2958         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2959         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
2960
2961         if (!h)
2962         {
2963             p->release();
2964             p = 0;
2965             return -1;
2966         }
2967
2968         if (ptronly)
2969             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
2970         else if( arg.m->dims <= 2 )
2971         {
2972             UMat2D u2d(*arg.m);
2973             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2974             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
2975             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
2976             i += 3;
2977
2978             if( !(arg.flags & KernelArg::NO_SIZE) )
2979             {
2980                 int cols = u2d.cols*arg.wscale/arg.iwscale;
2981                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
2982                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
2983                 i += 2;
2984             }
2985         }
2986         else
2987         {
2988             UMat3D u3d(*arg.m);
2989             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2990             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
2991             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
2992             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
2993             i += 4;
2994             if( !(arg.flags & KernelArg::NO_SIZE) )
2995             {
2996                 int cols = u3d.cols*arg.wscale/arg.iwscale;
2997                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
2998                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
2999                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
3000                 i += 3;
3001             }
3002         }
3003         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3004         return i;
3005     }
3006     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
3007     return i+1;
3008 }
3009
3010
3011 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3012                  bool sync, const Queue& q)
3013 {
3014     if(!p || !p->handle || p->e != 0)
3015         return false;
3016
3017     cl_command_queue qq = getQueue(q);
3018     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
3019     size_t total = 1;
3020     CV_Assert(_globalsize != 0);
3021     for (int i = 0; i < dims; i++)
3022     {
3023         size_t val = _localsize ? _localsize[i] :
3024             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3025         CV_Assert( val > 0 );
3026         total *= _globalsize[i];
3027         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
3028     }
3029     if( total == 0 )
3030         return true;
3031     if( p->haveTempDstUMats )
3032         sync = true;
3033     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3034                                            offset, globalsize, _localsize, 0, 0,
3035                                            sync ? 0 : &p->e);
3036     if( sync || retval != CL_SUCCESS )
3037     {
3038         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3039         p->cleanupUMats();
3040     }
3041     else
3042     {
3043         p->addref();
3044         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3045     }
3046     return retval == CL_SUCCESS;
3047 }
3048
3049 bool Kernel::runTask(bool sync, const Queue& q)
3050 {
3051     if(!p || !p->handle || p->e != 0)
3052         return false;
3053
3054     cl_command_queue qq = getQueue(q);
3055     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3056     if( sync || retval != CL_SUCCESS )
3057     {
3058         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3059         p->cleanupUMats();
3060     }
3061     else
3062     {
3063         p->addref();
3064         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3065     }
3066     return retval == CL_SUCCESS;
3067 }
3068
3069
3070 size_t Kernel::workGroupSize() const
3071 {
3072     if(!p || !p->handle)
3073         return 0;
3074     size_t val = 0, retsz = 0;
3075     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3076     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3077                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3078 }
3079
3080 size_t Kernel::preferedWorkGroupSizeMultiple() const
3081 {
3082     if(!p || !p->handle)
3083         return 0;
3084     size_t val = 0, retsz = 0;
3085     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3086     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3087                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3088 }
3089
3090 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3091 {
3092     if(!p || !p->handle || !wsz)
3093         return 0;
3094     size_t retsz = 0;
3095     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3096     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3097                                     sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
3098 }
3099
3100 size_t Kernel::localMemSize() const
3101 {
3102     if(!p || !p->handle)
3103         return 0;
3104     size_t retsz = 0;
3105     cl_ulong val = 0;
3106     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3107     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3108                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3109 }
3110
3111 /////////////////////////////////////////// Program /////////////////////////////////////////////
3112
3113 struct Program::Impl
3114 {
3115     Impl(const ProgramSource& _src,
3116          const String& _buildflags, String& errmsg)
3117     {
3118         refcount = 1;
3119         const Context& ctx = Context::getDefault();
3120         src = _src;
3121         buildflags = _buildflags;
3122         const String& srcstr = src.source();
3123         const char* srcptr = srcstr.c_str();
3124         size_t srclen = srcstr.size();
3125         cl_int retval = 0;
3126
3127         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3128         if( handle && retval == CL_SUCCESS )
3129         {
3130             int i, n = (int)ctx.ndevices();
3131             AutoBuffer<void*> deviceListBuf(n+1);
3132             void** deviceList = deviceListBuf;
3133             for( i = 0; i < n; i++ )
3134                 deviceList[i] = ctx.device(i).ptr();
3135
3136             Device device = Device::getDefault();
3137             if (device.isAMD())
3138                 buildflags += " -D AMD_DEVICE";
3139             else if (device.isIntel())
3140                 buildflags += " -D INTEL_DEVICE";
3141
3142             retval = clBuildProgram(handle, n,
3143                                     (const cl_device_id*)deviceList,
3144                                     buildflags.c_str(), 0, 0);
3145 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3146             if( retval != CL_SUCCESS )
3147 #endif
3148             {
3149                 size_t retsz = 0;
3150                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3151                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3152                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3153                 {
3154                     AutoBuffer<char> bufbuf(retsz + 16);
3155                     char* buf = bufbuf;
3156                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3157                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3158                     if (buildInfo_retval == CL_SUCCESS)
3159                     {
3160                         // TODO It is useful to see kernel name & program file name also
3161                         errmsg = String(buf);
3162                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3163                         fflush(stdout);
3164                     }
3165                 }
3166                 if (retval != CL_SUCCESS && handle)
3167                 {
3168                     clReleaseProgram(handle);
3169                     handle = NULL;
3170                 }
3171             }
3172         }
3173     }
3174
3175     Impl(const String& _buf, const String& _buildflags)
3176     {
3177         refcount = 1;
3178         handle = 0;
3179         buildflags = _buildflags;
3180         if(_buf.empty())
3181             return;
3182         String prefix0 = Program::getPrefix(buildflags);
3183         const Context& ctx = Context::getDefault();
3184         const Device& dev = Device::getDefault();
3185         const char* pos0 = _buf.c_str();
3186         const char* pos1 = strchr(pos0, '\n');
3187         if(!pos1)
3188             return;
3189         const char* pos2 = strchr(pos1+1, '\n');
3190         if(!pos2)
3191             return;
3192         const char* pos3 = strchr(pos2+1, '\n');
3193         if(!pos3)
3194             return;
3195         size_t prefixlen = (pos3 - pos0)+1;
3196         String prefix(pos0, prefixlen);
3197         if( prefix != prefix0 )
3198             return;
3199         const uchar* bin = (uchar*)(pos3+1);
3200         void* devid = dev.ptr();
3201         size_t codelen = _buf.length() - prefixlen;
3202         cl_int binstatus = 0, retval = 0;
3203         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3204                                            &codelen, &bin, &binstatus, &retval);
3205         CV_OclDbgAssert(retval == CL_SUCCESS);
3206     }
3207
3208     String store()
3209     {
3210         if(!handle)
3211             return String();
3212         size_t progsz = 0, retsz = 0;
3213         String prefix = Program::getPrefix(buildflags);
3214         size_t prefixlen = prefix.length();
3215         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3216             return String();
3217         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3218         uchar* buf = bufbuf;
3219         memcpy(buf, prefix.c_str(), prefixlen);
3220         buf += prefixlen;
3221         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3222             return String();
3223         buf[progsz] = (uchar)'\0';
3224         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3225     }
3226
3227     ~Impl()
3228     {
3229         if( handle )
3230         {
3231 #ifdef _WIN32
3232             if (!cv::__termination)
3233 #endif
3234             {
3235                 clReleaseProgram(handle);
3236             }
3237             handle = NULL;
3238         }
3239     }
3240
3241     IMPLEMENT_REFCOUNTABLE();
3242
3243     ProgramSource src;
3244     String buildflags;
3245     cl_program handle;
3246 };
3247
3248
3249 Program::Program() { p = 0; }
3250
3251 Program::Program(const ProgramSource& src,
3252         const String& buildflags, String& errmsg)
3253 {
3254     p = 0;
3255     create(src, buildflags, errmsg);
3256 }
3257
3258 Program::Program(const Program& prog)
3259 {
3260     p = prog.p;
3261     if(p)
3262         p->addref();
3263 }
3264
3265 Program& Program::operator = (const Program& prog)
3266 {
3267     Impl* newp = (Impl*)prog.p;
3268     if(newp)
3269         newp->addref();
3270     if(p)
3271         p->release();
3272     p = newp;
3273     return *this;
3274 }
3275
3276 Program::~Program()
3277 {
3278     if(p)
3279         p->release();
3280 }
3281
3282 bool Program::create(const ProgramSource& src,
3283             const String& buildflags, String& errmsg)
3284 {
3285     if(p)
3286         p->release();
3287     p = new Impl(src, buildflags, errmsg);
3288     if(!p->handle)
3289     {
3290         p->release();
3291         p = 0;
3292     }
3293     return p != 0;
3294 }
3295
3296 const ProgramSource& Program::source() const
3297 {
3298     static ProgramSource dummy;
3299     return p ? p->src : dummy;
3300 }
3301
3302 void* Program::ptr() const
3303 {
3304     return p ? p->handle : 0;
3305 }
3306
3307 bool Program::read(const String& bin, const String& buildflags)
3308 {
3309     if(p)
3310         p->release();
3311     p = new Impl(bin, buildflags);
3312     return p->handle != 0;
3313 }
3314
3315 bool Program::write(String& bin) const
3316 {
3317     if(!p)
3318         return false;
3319     bin = p->store();
3320     return !bin.empty();
3321 }
3322
3323 String Program::getPrefix() const
3324 {
3325     if(!p)
3326         return String();
3327     return getPrefix(p->buildflags);
3328 }
3329
3330 String Program::getPrefix(const String& buildflags)
3331 {
3332     const Context& ctx = Context::getDefault();
3333     const Device& dev = ctx.device(0);
3334     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3335                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3336 }
3337
3338 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3339
3340 struct ProgramSource::Impl
3341 {
3342     Impl(const char* _src)
3343     {
3344         init(String(_src));
3345     }
3346     Impl(const String& _src)
3347     {
3348         init(_src);
3349     }
3350     void init(const String& _src)
3351     {
3352         refcount = 1;
3353         src = _src;
3354         h = crc64((uchar*)src.c_str(), src.size());
3355     }
3356
3357     IMPLEMENT_REFCOUNTABLE();
3358     String src;
3359     ProgramSource::hash_t h;
3360 };
3361
3362
3363 ProgramSource::ProgramSource()
3364 {
3365     p = 0;
3366 }
3367
3368 ProgramSource::ProgramSource(const char* prog)
3369 {
3370     p = new Impl(prog);
3371 }
3372
3373 ProgramSource::ProgramSource(const String& prog)
3374 {
3375     p = new Impl(prog);
3376 }
3377
3378 ProgramSource::~ProgramSource()
3379 {
3380     if(p)
3381         p->release();
3382 }
3383
3384 ProgramSource::ProgramSource(const ProgramSource& prog)
3385 {
3386     p = prog.p;
3387     if(p)
3388         p->addref();
3389 }
3390
3391 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3392 {
3393     Impl* newp = (Impl*)prog.p;
3394     if(newp)
3395         newp->addref();
3396     if(p)
3397         p->release();
3398     p = newp;
3399     return *this;
3400 }
3401
3402 const String& ProgramSource::source() const
3403 {
3404     static String dummy;
3405     return p ? p->src : dummy;
3406 }
3407
3408 ProgramSource::hash_t ProgramSource::hash() const
3409 {
3410     return p ? p->h : 0;
3411 }
3412
3413 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3414
3415 class OpenCLBufferPool
3416 {
3417 protected:
3418     ~OpenCLBufferPool() { }
3419 public:
3420     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
3421     virtual void release(cl_mem handle, size_t capacity) = 0;
3422 };
3423
3424 class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
3425 {
3426 public:
3427     struct BufferEntry
3428     {
3429         cl_mem clBuffer_;
3430         size_t capacity_;
3431     };
3432 protected:
3433     Mutex mutex_;
3434
3435     size_t currentReservedSize;
3436     size_t maxReservedSize;
3437
3438     std::list<BufferEntry> reservedEntries_; // LRU order
3439
3440     // synchronized
3441     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3442     {
3443         if (reservedEntries_.empty())
3444             return false;
3445         std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3446         std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3447         BufferEntry result = {NULL, 0};
3448         size_t minDiff = (size_t)(-1);
3449         for (; i != reservedEntries_.end(); ++i)
3450         {
3451             BufferEntry& e = *i;
3452             if (e.capacity_ >= size)
3453             {
3454                 size_t diff = e.capacity_ - size;
3455                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3456                 {
3457                     minDiff = diff;
3458                     result_pos = i;
3459                     result = e;
3460                     if (diff == 0)
3461                         break;
3462                 }
3463             }
3464         }
3465         if (result_pos != reservedEntries_.end())
3466         {
3467             //CV_DbgAssert(result == *result_pos);
3468             reservedEntries_.erase(result_pos);
3469             entry = result;
3470             currentReservedSize -= entry.capacity_;
3471             return true;
3472         }
3473         return false;
3474     }
3475
3476     // synchronized
3477     void _checkSizeOfReservedEntries()
3478     {
3479         while (currentReservedSize > maxReservedSize)
3480         {
3481             CV_DbgAssert(!reservedEntries_.empty());
3482             const BufferEntry& entry = reservedEntries_.back();
3483             CV_DbgAssert(currentReservedSize >= entry.capacity_);
3484             currentReservedSize -= entry.capacity_;
3485             _releaseBufferEntry(entry);
3486             reservedEntries_.pop_back();
3487         }
3488     }
3489
3490     inline size_t _allocationGranularity(size_t size)
3491     {
3492         // heuristic values
3493         if (size < 1024)
3494             return 16;
3495         else if (size < 64*1024)
3496             return 64;
3497         else if (size < 1024*1024)
3498             return 4096;
3499         else if (size < 16*1024*1024)
3500             return 64*1024;
3501         else
3502             return 1024*1024;
3503     }
3504
3505     void _allocateBufferEntry(BufferEntry& entry, size_t size)
3506     {
3507         CV_DbgAssert(entry.clBuffer_ == NULL);
3508         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3509         Context& ctx = Context::getDefault();
3510         cl_int retval = CL_SUCCESS;
3511         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
3512         CV_Assert(retval == CL_SUCCESS);
3513         CV_Assert(entry.clBuffer_ != NULL);
3514         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3515                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3516     }
3517
3518     void _releaseBufferEntry(const BufferEntry& entry)
3519     {
3520         CV_Assert(entry.capacity_ != 0);
3521         CV_Assert(entry.clBuffer_ != NULL);
3522         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
3523                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
3524         clReleaseMemObject(entry.clBuffer_);
3525     }
3526 public:
3527     OpenCLBufferPoolImpl()
3528         : currentReservedSize(0), maxReservedSize(0)
3529     {
3530         int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
3531         maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
3532     }
3533     virtual ~OpenCLBufferPoolImpl()
3534     {
3535         freeAllReservedBuffers();
3536         CV_Assert(reservedEntries_.empty());
3537     }
3538 public:
3539     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
3540     {
3541         BufferEntry entry = {NULL, 0};
3542         if (maxReservedSize > 0)
3543         {
3544             AutoLock locker(mutex_);
3545             if (_findAndRemoveEntryFromReservedList(entry, size))
3546             {
3547                 CV_DbgAssert(size <= entry.capacity_);
3548                 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3549                 capacity = entry.capacity_;
3550                 return entry.clBuffer_;
3551             }
3552         }
3553         _allocateBufferEntry(entry, size);
3554         capacity = entry.capacity_;
3555         return entry.clBuffer_;
3556     }
3557     virtual void release(cl_mem handle, size_t capacity)
3558     {
3559         BufferEntry entry = {handle, capacity};
3560         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3561         {
3562             _releaseBufferEntry(entry);
3563         }
3564         else
3565         {
3566             AutoLock locker(mutex_);
3567             reservedEntries_.push_front(entry);
3568             currentReservedSize += entry.capacity_;
3569             _checkSizeOfReservedEntries();
3570         }
3571     }
3572
3573     virtual size_t getReservedSize() const { return currentReservedSize; }
3574     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3575     virtual void setMaxReservedSize(size_t size)
3576     {
3577         AutoLock locker(mutex_);
3578         size_t oldMaxReservedSize = maxReservedSize;
3579         maxReservedSize = size;
3580         if (maxReservedSize < oldMaxReservedSize)
3581         {
3582             std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3583             for (; i != reservedEntries_.end();)
3584             {
3585                 const BufferEntry& entry = *i;
3586                 if (entry.capacity_ > maxReservedSize / 8)
3587                 {
3588                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
3589                     currentReservedSize -= entry.capacity_;
3590                     _releaseBufferEntry(entry);
3591                     i = reservedEntries_.erase(i);
3592                     continue;
3593                 }
3594                 ++i;
3595             }
3596             _checkSizeOfReservedEntries();
3597         }
3598     }
3599     virtual void freeAllReservedBuffers()
3600     {
3601         AutoLock locker(mutex_);
3602         std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3603         for (; i != reservedEntries_.end(); ++i)
3604         {
3605             const BufferEntry& entry = *i;
3606             _releaseBufferEntry(entry);
3607         }
3608         reservedEntries_.clear();
3609     }
3610 };
3611
3612 #if defined _MSC_VER
3613 #pragma warning(disable:4127) // conditional expression is constant
3614 #endif
3615 template <bool readAccess, bool writeAccess>
3616 class AlignedDataPtr
3617 {
3618 protected:
3619     const size_t size_;
3620     uchar* const originPtr_;
3621     const size_t alignment_;
3622     uchar* ptr_;
3623     uchar* allocatedPtr_;
3624
3625 public:
3626     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
3627         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
3628     {
3629         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
3630         if (((size_t)ptr_ & (alignment - 1)) != 0)
3631         {
3632             allocatedPtr_ = new uchar[size_ + alignment - 1];
3633             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
3634             if (readAccess)
3635             {
3636                 memcpy(ptr_, originPtr_, size_);
3637             }
3638         }
3639     }
3640
3641     uchar* getAlignedPtr() const
3642     {
3643         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
3644         return ptr_;
3645     }
3646
3647     ~AlignedDataPtr()
3648     {
3649         if (allocatedPtr_)
3650         {
3651             if (writeAccess)
3652             {
3653                 memcpy(originPtr_, ptr_, size_);
3654             }
3655             delete[] allocatedPtr_;
3656             allocatedPtr_ = NULL;
3657         }
3658         ptr_ = NULL;
3659     }
3660 private:
3661     AlignedDataPtr(const AlignedDataPtr&); // disabled
3662     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
3663 };
3664 #if defined _MSC_VER
3665 #pragma warning(default:4127) // conditional expression is constant
3666 #endif
3667
3668 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
3669 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
3670 #endif
3671
3672 class OpenCLAllocator : public MatAllocator
3673 {
3674     mutable OpenCLBufferPoolImpl bufferPool;
3675     enum AllocatorFlags
3676     {
3677         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
3678     };
3679 public:
3680     OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
3681
3682     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
3683             int flags, UMatUsageFlags usageFlags) const
3684     {
3685         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
3686         return u;
3687     }
3688
3689     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
3690     {
3691         const Device& dev = ctx.device(0);
3692         createFlags = 0;
3693         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
3694             createFlags |= CL_MEM_ALLOC_HOST_PTR;
3695
3696         if( dev.hostUnifiedMemory() )
3697             flags0 = 0;
3698         else
3699             flags0 = UMatData::COPY_ON_MAP;
3700     }
3701
3702     UMatData* allocate(int dims, const int* sizes, int type,
3703                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
3704     {
3705         if(!useOpenCL())
3706             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3707         CV_Assert(data == 0);
3708         size_t total = CV_ELEM_SIZE(type);
3709         for( int i = dims-1; i >= 0; i-- )
3710         {
3711             if( step )
3712                 step[i] = total;
3713             total *= sizes[i];
3714         }
3715
3716         Context& ctx = Context::getDefault();
3717         int createFlags = 0, flags0 = 0;
3718         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
3719
3720         size_t capacity = 0;
3721         void* handle = NULL;
3722         int allocatorFlags = 0;
3723         if (createFlags == 0)
3724         {
3725             handle = bufferPool.allocate(total, capacity);
3726             if (!handle)
3727                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3728             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
3729         }
3730         else
3731         {
3732             capacity = total;
3733             cl_int retval = 0;
3734             handle = clCreateBuffer((cl_context)ctx.ptr(),
3735                                           CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
3736             if( !handle || retval != CL_SUCCESS )
3737                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3738         }
3739         UMatData* u = new UMatData(this);
3740         u->data = 0;
3741         u->size = total;
3742         u->capacity = capacity;
3743         u->handle = handle;
3744         u->flags = flags0;
3745         u->allocatorFlags_ = allocatorFlags;
3746         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
3747         return u;
3748     }
3749
3750     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
3751     {
3752         if(!u)
3753             return false;
3754
3755         UMatDataAutoLock lock(u);
3756
3757         if(u->handle == 0)
3758         {
3759             CV_Assert(u->origdata != 0);
3760             Context& ctx = Context::getDefault();
3761             int createFlags = 0, flags0 = 0;
3762             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
3763
3764             cl_context ctx_handle = (cl_context)ctx.ptr();
3765             cl_int retval = 0;
3766             int tempUMatFlags = UMatData::TEMP_UMAT;
3767             u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
3768                                        u->size, u->origdata, &retval);
3769             if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
3770             {
3771                 u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
3772                                            u->size, u->origdata, &retval);
3773                 tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
3774
3775             }
3776             if(!u->handle || retval != CL_SUCCESS)
3777                 return false;
3778             u->prevAllocator = u->currAllocator;
3779             u->currAllocator = this;
3780             u->flags |= tempUMatFlags;
3781         }
3782         if(accessFlags & ACCESS_WRITE)
3783             u->markHostCopyObsolete(true);
3784         return true;
3785     }
3786
3787     /*void sync(UMatData* u) const
3788     {
3789         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3790         UMatDataAutoLock lock(u);
3791
3792         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
3793         {
3794             if( u->tempCopiedUMat() )
3795             {
3796                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3797                                     u->size, u->origdata, 0, 0, 0);
3798             }
3799             else
3800             {
3801                 cl_int retval = 0;
3802                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3803                                                 (CL_MAP_READ | CL_MAP_WRITE),
3804                                                 0, u->size, 0, 0, 0, &retval);
3805                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
3806                 clFinish(q);
3807             }
3808             u->markHostCopyObsolete(false);
3809         }
3810         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
3811         {
3812             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3813                                  u->size, u->data, 0, 0, 0);
3814         }
3815     }*/
3816
3817     void deallocate(UMatData* u) const
3818     {
3819         if(!u)
3820             return;
3821
3822         CV_Assert(u->urefcount >= 0);
3823         CV_Assert(u->refcount >= 0);
3824
3825         // TODO: !!! when we add Shared Virtual Memory Support,
3826         // this function (as well as the others) should be corrected
3827         CV_Assert(u->handle != 0 && u->urefcount == 0);
3828         if(u->tempUMat())
3829         {
3830 //            UMatDataAutoLock lock(u);
3831             if( u->hostCopyObsolete() && u->refcount > 0 )
3832             {
3833                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3834                 if( u->tempCopiedUMat() )
3835                 {
3836                     AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3837                     CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3838                                         u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
3839                 }
3840                 else
3841                 {
3842                     cl_int retval = 0;
3843                     void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3844                                                     (CL_MAP_READ | CL_MAP_WRITE),
3845                                                     0, u->size, 0, 0, 0, &retval);
3846                     CV_OclDbgAssert(retval == CL_SUCCESS);
3847                     CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
3848                     CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3849                 }
3850             }
3851             u->markHostCopyObsolete(false);
3852             clReleaseMemObject((cl_mem)u->handle);
3853             u->handle = 0;
3854             u->currAllocator = u->prevAllocator;
3855             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3856                 fastFree(u->data);
3857             u->data = u->origdata;
3858             if(u->refcount == 0)
3859                 u->currAllocator->deallocate(u);
3860         }
3861         else
3862         {
3863             CV_Assert(u->refcount == 0);
3864             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3865             {
3866                 fastFree(u->data);
3867                 u->data = 0;
3868             }
3869             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
3870             {
3871                 bufferPool.release((cl_mem)u->handle, u->capacity);
3872             }
3873             else
3874             {
3875                 clReleaseMemObject((cl_mem)u->handle);
3876             }
3877             u->handle = 0;
3878             u->capacity = 0;
3879             delete u;
3880         }
3881     }
3882
3883     void map(UMatData* u, int accessFlags) const
3884     {
3885         if(!u)
3886             return;
3887
3888         CV_Assert( u->handle != 0 );
3889
3890         UMatDataAutoLock autolock(u);
3891
3892         if(accessFlags & ACCESS_WRITE)
3893             u->markDeviceCopyObsolete(true);
3894
3895         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3896
3897         // FIXIT Workaround for UMat synchronization issue
3898         // if( u->refcount == 0 )
3899         {
3900             if( !u->copyOnMap() )
3901             {
3902                 if (u->data) // FIXIT Workaround for UMat synchronization issue
3903                 {
3904                     //CV_Assert(u->hostCopyObsolete() == false);
3905                     return;
3906                 }
3907                 // because there can be other map requests for the same UMat with different access flags,
3908                 // we use the universal (read-write) access mode.
3909                 cl_int retval = 0;
3910                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3911                                                      (CL_MAP_READ | CL_MAP_WRITE),
3912                                                      0, u->size, 0, 0, 0, &retval);
3913                 if(u->data && retval == CL_SUCCESS)
3914                 {
3915                     u->markHostCopyObsolete(false);
3916                     u->markDeviceMemMapped(true);
3917                     return;
3918                 }
3919
3920                 // if map failed, switch to copy-on-map mode for the particular buffer
3921                 u->flags |= UMatData::COPY_ON_MAP;
3922             }
3923
3924             if(!u->data)
3925             {
3926                 u->data = (uchar*)fastMalloc(u->size);
3927                 u->markHostCopyObsolete(true);
3928             }
3929         }
3930
3931         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
3932         {
3933             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3934             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3935                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
3936             u->markHostCopyObsolete(false);
3937         }
3938     }
3939
3940     void unmap(UMatData* u) const
3941     {
3942         if(!u)
3943             return;
3944
3945
3946         CV_Assert(u->handle != 0);
3947
3948         UMatDataAutoLock autolock(u);
3949
3950         // FIXIT Workaround for UMat synchronization issue
3951         if(u->refcount > 0)
3952             return;
3953
3954         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3955         cl_int retval = 0;
3956         if( !u->copyOnMap() && u->deviceMemMapped() )
3957         {
3958             CV_Assert(u->data != NULL);
3959             u->markDeviceMemMapped(false);
3960             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
3961                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
3962             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3963             u->data = 0;
3964         }
3965         else if( u->copyOnMap() && u->deviceCopyObsolete() )
3966         {
3967             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3968             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3969                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
3970         }
3971         u->markDeviceCopyObsolete(false);
3972         u->markHostCopyObsolete(false);
3973     }
3974
3975     bool checkContinuous(int dims, const size_t sz[],
3976                          const size_t srcofs[], const size_t srcstep[],
3977                          const size_t dstofs[], const size_t dststep[],
3978                          size_t& total, size_t new_sz[],
3979                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
3980                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
3981     {
3982         bool iscontinuous = true;
3983         srcrawofs = srcofs ? srcofs[dims-1] : 0;
3984         dstrawofs = dstofs ? dstofs[dims-1] : 0;
3985         total = sz[dims-1];
3986         for( int i = dims-2; i >= 0; i-- )
3987         {
3988             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
3989                 iscontinuous = false;
3990             total *= sz[i];
3991             if( srcofs )
3992                 srcrawofs += srcofs[i]*srcstep[i];
3993             if( dstofs )
3994                 dstrawofs += dstofs[i]*dststep[i];
3995         }
3996
3997         if( !iscontinuous )
3998         {
3999             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
4000             if( dims == 2 )
4001             {
4002                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
4003                 // we assume that new_... arrays are initialized by caller
4004                 // with 0's, so there is no else branch
4005                 if( srcofs )
4006                 {
4007                     new_srcofs[0] = srcofs[1];
4008                     new_srcofs[1] = srcofs[0];
4009                     new_srcofs[2] = 0;
4010                 }
4011
4012                 if( dstofs )
4013                 {
4014                     new_dstofs[0] = dstofs[1];
4015                     new_dstofs[1] = dstofs[0];
4016                     new_dstofs[2] = 0;
4017                 }
4018
4019                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
4020                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
4021             }
4022             else
4023             {
4024                 // we could check for dims == 3 here,
4025                 // but from user perspective this one is more informative
4026                 CV_Assert(dims <= 3);
4027                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
4028                 if( srcofs )
4029                 {
4030                     new_srcofs[0] = srcofs[2];
4031                     new_srcofs[1] = srcofs[1];
4032                     new_srcofs[2] = srcofs[0];
4033                 }
4034
4035                 if( dstofs )
4036                 {
4037                     new_dstofs[0] = dstofs[2];
4038                     new_dstofs[1] = dstofs[1];
4039                     new_dstofs[2] = dstofs[0];
4040                 }
4041
4042                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4043                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4044             }
4045         }
4046         return iscontinuous;
4047     }
4048
4049     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4050                   const size_t srcofs[], const size_t srcstep[],
4051                   const size_t dststep[]) const
4052     {
4053         if(!u)
4054             return;
4055         UMatDataAutoLock autolock(u);
4056
4057         if( u->data && !u->hostCopyObsolete() )
4058         {
4059             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4060             return;
4061         }
4062         CV_Assert( u->handle != 0 );
4063
4064         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4065
4066         size_t total = 0, new_sz[] = {0, 0, 0};
4067         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4068         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4069
4070         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4071                                             total, new_sz,
4072                                             srcrawofs, new_srcofs, new_srcstep,
4073                                             dstrawofs, new_dstofs, new_dststep);
4074
4075         AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4076         if( iscontinuous )
4077         {
4078             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4079                                            srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4080         }
4081         else
4082         {
4083             CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4084                             new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4085                             new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4086         }
4087     }
4088
4089     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4090                 const size_t dstofs[], const size_t dststep[],
4091                 const size_t srcstep[]) const
4092     {
4093         if(!u)
4094             return;
4095
4096         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4097         CV_Assert(u->refcount == 0 || u->tempUMat());
4098
4099         size_t total = 0, new_sz[] = {0, 0, 0};
4100         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4101         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4102
4103         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4104                                             total, new_sz,
4105                                             srcrawofs, new_srcofs, new_srcstep,
4106                                             dstrawofs, new_dstofs, new_dststep);
4107
4108         UMatDataAutoLock autolock(u);
4109
4110         // if there is cached CPU copy of the GPU matrix,
4111         // we could use it as a destination.
4112         // we can do it in 2 cases:
4113         //    1. we overwrite the whole content
4114         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4115         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4116         {
4117             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4118             u->markHostCopyObsolete(false);
4119             u->markDeviceCopyObsolete(true);
4120             return;
4121         }
4122
4123         CV_Assert( u->handle != 0 );
4124         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4125
4126         AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4127         if( iscontinuous )
4128         {
4129             CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4130                 CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
4131         }
4132         else
4133         {
4134             CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4135                 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4136                 new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
4137         }
4138
4139         u->markHostCopyObsolete(true);
4140         u->markDeviceCopyObsolete(false);
4141     }
4142
4143     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4144               const size_t srcofs[], const size_t srcstep[],
4145               const size_t dstofs[], const size_t dststep[], bool _sync) const
4146     {
4147         if(!src || !dst)
4148             return;
4149
4150         size_t total = 0, new_sz[] = {0, 0, 0};
4151         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4152         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4153
4154         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4155                                             total, new_sz,
4156                                             srcrawofs, new_srcofs, new_srcstep,
4157                                             dstrawofs, new_dstofs, new_dststep);
4158
4159         UMatDataAutoLock src_autolock(src);
4160         UMatDataAutoLock dst_autolock(dst);
4161
4162         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
4163         {
4164             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
4165             return;
4166         }
4167         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
4168         {
4169             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
4170             dst->markHostCopyObsolete(false);
4171             dst->markDeviceCopyObsolete(true);
4172             return;
4173         }
4174
4175         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4176         CV_Assert(dst->refcount == 0);
4177         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4178
4179         if( iscontinuous )
4180         {
4181             CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4182                                            srcrawofs, dstrawofs, total, 0, 0, 0) == CL_SUCCESS );
4183         }
4184         else
4185         {
4186             cl_int retval;
4187             CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4188                                                new_srcofs, new_dstofs, new_sz,
4189                                                new_srcstep[0], new_srcstep[1],
4190                                                new_dststep[0], new_dststep[1],
4191                                                0, 0, 0)) == CL_SUCCESS );
4192         }
4193
4194         dst->markHostCopyObsolete(true);
4195         dst->markDeviceCopyObsolete(false);
4196
4197         if( _sync )
4198         {
4199             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4200         }
4201     }
4202
4203     BufferPoolController* getBufferPoolController() const { return &bufferPool; }
4204
4205     MatAllocator* matStdAllocator;
4206 };
4207
4208 MatAllocator* getOpenCLAllocator()
4209 {
4210     static MatAllocator * allocator = new OpenCLAllocator();
4211     return allocator;
4212 }
4213
4214 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
4215
4216 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
4217 {
4218     cl_uint numDevices = 0;
4219     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4220                                 0, NULL, &numDevices) == CL_SUCCESS);
4221
4222     if (numDevices == 0)
4223     {
4224         devices.clear();
4225         return;
4226     }
4227
4228     devices.resize((size_t)numDevices);
4229     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4230                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
4231 }
4232
4233 struct PlatformInfo::Impl
4234 {
4235     Impl(void* id)
4236     {
4237         refcount = 1;
4238         handle = *(cl_platform_id*)id;
4239         getDevices(devices, handle);
4240     }
4241
4242     String getStrProp(cl_device_info prop) const
4243     {
4244         char buf[1024];
4245         size_t sz=0;
4246         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
4247             sz < sizeof(buf) ? String(buf) : String();
4248     }
4249
4250     IMPLEMENT_REFCOUNTABLE();
4251     std::vector<cl_device_id> devices;
4252     cl_platform_id handle;
4253 };
4254
4255 PlatformInfo::PlatformInfo()
4256 {
4257     p = 0;
4258 }
4259
4260 PlatformInfo::PlatformInfo(void* platform_id)
4261 {
4262     p = new Impl(platform_id);
4263 }
4264
4265 PlatformInfo::~PlatformInfo()
4266 {
4267     if(p)
4268         p->release();
4269 }
4270
4271 PlatformInfo::PlatformInfo(const PlatformInfo& i)
4272 {
4273     if (i.p)
4274         i.p->addref();
4275     p = i.p;
4276 }
4277
4278 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
4279 {
4280     if (i.p != p)
4281     {
4282         if (i.p)
4283             i.p->addref();
4284         if (p)
4285             p->release();
4286         p = i.p;
4287     }
4288     return *this;
4289 }
4290
4291 int PlatformInfo::deviceNumber() const
4292 {
4293     return p ? (int)p->devices.size() : 0;
4294 }
4295
4296 void PlatformInfo::getDevice(Device& device, int d) const
4297 {
4298     CV_Assert(p && d < (int)p->devices.size() );
4299     if(p)
4300         device.set(p->devices[d]);
4301 }
4302
4303 String PlatformInfo::name() const
4304 {
4305     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
4306 }
4307
4308 String PlatformInfo::vendor() const
4309 {
4310     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
4311 }
4312
4313 String PlatformInfo::version() const
4314 {
4315     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
4316 }
4317
4318 static void getPlatforms(std::vector<cl_platform_id>& platforms)
4319 {
4320     cl_uint numPlatforms = 0;
4321     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
4322
4323     if (numPlatforms == 0)
4324     {
4325         platforms.clear();
4326         return;
4327     }
4328
4329     platforms.resize((size_t)numPlatforms);
4330     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
4331 }
4332
4333 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
4334 {
4335     std::vector<cl_platform_id> platforms;
4336     getPlatforms(platforms);
4337
4338     for (size_t i = 0; i < platforms.size(); i++)
4339         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
4340 }
4341
4342 const char* typeToStr(int type)
4343 {
4344     static const char* tab[]=
4345     {
4346         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4347         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4348         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4349         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4350         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4351         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
4352         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
4353         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4354     };
4355     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4356     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4357 }
4358
4359 const char* memopTypeToStr(int type)
4360 {
4361     static const char* tab[] =
4362     {
4363         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4364         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4365         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4366         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4367         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4368         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4369         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
4370         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4371     };
4372     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4373     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4374 }
4375
4376 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
4377 {
4378     if( sdepth == ddepth )
4379         return "noconvert";
4380     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
4381     if( ddepth >= CV_32F ||
4382         (ddepth == CV_32S && sdepth < CV_32S) ||
4383         (ddepth == CV_16S && sdepth <= CV_8S) ||
4384         (ddepth == CV_16U && sdepth == CV_8U))
4385     {
4386         sprintf(buf, "convert_%s", typestr);
4387     }
4388     else if( sdepth >= CV_32F )
4389         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
4390     else
4391         sprintf(buf, "convert_%s_sat", typestr);
4392
4393     return buf;
4394 }
4395
4396 template <typename T>
4397 static std::string kerToStr(const Mat & k)
4398 {
4399     int width = k.cols - 1, depth = k.depth();
4400     const T * const data = k.ptr<T>();
4401
4402     std::ostringstream stream;
4403     stream.precision(10);
4404
4405     if (depth <= CV_8S)
4406     {
4407         for (int i = 0; i < width; ++i)
4408             stream << "DIG(" << (int)data[i] << ")";
4409         stream << "DIG(" << (int)data[width] << ")";
4410     }
4411     else if (depth == CV_32F)
4412     {
4413         stream.setf(std::ios_base::showpoint);
4414         for (int i = 0; i < width; ++i)
4415             stream << "DIG(" << data[i] << "f)";
4416         stream << "DIG(" << data[width] << "f)";
4417     }
4418     else
4419     {
4420         for (int i = 0; i < width; ++i)
4421             stream << "DIG(" << data[i] << ")";
4422         stream << "DIG(" << data[width] << ")";
4423     }
4424
4425     return stream.str();
4426 }
4427
4428 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
4429 {
4430     Mat kernel = _kernel.getMat().reshape(1, 1);
4431
4432     int depth = kernel.depth();
4433     if (ddepth < 0)
4434         ddepth = depth;
4435
4436     if (ddepth != depth)
4437         kernel.convertTo(kernel, ddepth);
4438
4439     typedef std::string (* func_t)(const Mat &);
4440     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
4441                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
4442     const func_t func = funcs[ddepth];
4443     CV_Assert(func != 0);
4444
4445     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
4446 }
4447
4448 #define PROCESS_SRC(src) \
4449     do \
4450     { \
4451         if (!src.empty()) \
4452         { \
4453             CV_Assert(src.isMat() || src.isUMat()); \
4454             int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
4455             Size csize = src.size(); \
4456             cols.push_back(ccn * csize.width); \
4457             if (ctype != type) \
4458                 return 1; \
4459             offsets.push_back(src.offset()); \
4460             steps.push_back(src.step()); \
4461         } \
4462     } \
4463     while ((void)0, 0)
4464
4465 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
4466                               InputArray src4, InputArray src5, InputArray src6,
4467                               InputArray src7, InputArray src8, InputArray src9)
4468 {
4469     int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
4470     Size ssize = src1.size();
4471     const ocl::Device & d = ocl::Device::getDefault();
4472
4473     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
4474         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
4475         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
4476         d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
4477
4478     // if the device says don't use vectors
4479     if (vectorWidths[0] == 1)
4480     {
4481         // it's heuristic
4482         int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
4483         kercn = vectorWidthsOthers[depth];
4484     }
4485
4486     if (ssize.width * cn < kercn || kercn <= 0)
4487         return 1;
4488
4489     std::vector<size_t> offsets, steps, cols;
4490     PROCESS_SRC(src1);
4491     PROCESS_SRC(src2);
4492     PROCESS_SRC(src3);
4493     PROCESS_SRC(src4);
4494     PROCESS_SRC(src5);
4495     PROCESS_SRC(src6);
4496     PROCESS_SRC(src7);
4497     PROCESS_SRC(src8);
4498     PROCESS_SRC(src9);
4499
4500     size_t size = offsets.size();
4501     int wsz = kercn * esz1;
4502     std::vector<int> dividers(size, wsz);
4503
4504     for (size_t i = 0; i < size; ++i)
4505         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
4506             dividers[i] >>= 1;
4507
4508     // default strategy
4509     for (size_t i = 0; i < size; ++i)
4510         if (dividers[i] != wsz)
4511         {
4512             kercn = 1;
4513             break;
4514         }
4515
4516     // another strategy
4517 //    width = *std::min_element(dividers.begin(), dividers.end());
4518
4519     return kercn;
4520 }
4521
4522 #undef PROCESS_SRC
4523
4524
4525 // TODO Make this as a method of OpenCL "BuildOptions" class
4526 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
4527 {
4528     if (!buildOptions.empty())
4529         buildOptions += " ";
4530     int type = _m.type(), depth = CV_MAT_DEPTH(type);
4531     buildOptions += format(
4532             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
4533             name.c_str(), ocl::typeToStr(type),
4534             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
4535             name.c_str(), (int)CV_MAT_CN(type),
4536             name.c_str(), (int)CV_ELEM_SIZE(type),
4537             name.c_str(), (int)CV_ELEM_SIZE1(type),
4538             name.c_str(), (int)depth
4539             );
4540 }
4541
4542
4543 struct Image2D::Impl
4544 {
4545     Impl(const UMat &src, bool norm, bool alias)
4546     {
4547         handle = 0;
4548         refcount = 1;
4549         init(src, norm, alias);
4550     }
4551
4552     ~Impl()
4553     {
4554         if (handle)
4555             clReleaseMemObject(handle);
4556     }
4557
4558     static cl_image_format getImageFormat(int depth, int cn, bool norm)
4559     {
4560         cl_image_format format;
4561         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
4562                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
4563         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
4564                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
4565         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
4566
4567         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
4568         int channelOrder = channelOrders[cn];
4569         format.image_channel_data_type = (cl_channel_type)channelType;
4570         format.image_channel_order = (cl_channel_order)channelOrder;
4571         return format;
4572     }
4573
4574     static bool isFormatSupported(cl_image_format format)
4575     {
4576         cl_context context = (cl_context)Context::getDefault().ptr();
4577         // Figure out how many formats are supported by this context.
4578         cl_uint numFormats = 0;
4579         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4580                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
4581                                                 NULL, &numFormats);
4582         AutoBuffer<cl_image_format> formats(numFormats);
4583         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4584                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
4585                                          formats, NULL);
4586         CV_OclDbgAssert(err == CL_SUCCESS);
4587         for (cl_uint i = 0; i < numFormats; ++i)
4588         {
4589             if (!memcmp(&formats[i], &format, sizeof(format)))
4590             {
4591                 return true;
4592             }
4593         }
4594         return false;
4595     }
4596
4597     void init(const UMat &src, bool norm, bool alias)
4598     {
4599         CV_Assert(ocl::Device::getDefault().imageSupport());
4600
4601         int err, depth = src.depth(), cn = src.channels();
4602         CV_Assert(cn <= 4);
4603         cl_image_format format = getImageFormat(depth, cn, norm);
4604
4605         if (!isFormatSupported(format))
4606             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
4607
4608         cl_context context = (cl_context)Context::getDefault().ptr();
4609         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
4610
4611 #ifdef CL_VERSION_1_2
4612         // this enables backwards portability to
4613         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
4614         const Device & d = ocl::Device::getDefault();
4615         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
4616         CV_Assert(!alias || canCreateAlias(src));
4617         if (1 < major || (1 == major && 2 <= minor))
4618         {
4619             cl_image_desc desc;
4620             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
4621             desc.image_width      = src.cols;
4622             desc.image_height     = src.rows;
4623             desc.image_depth      = 0;
4624             desc.image_array_size = 1;
4625             desc.image_row_pitch  = alias ? src.step[0] : 0;
4626             desc.image_slice_pitch = 0;
4627             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
4628             desc.num_mip_levels   = 0;
4629             desc.num_samples      = 0;
4630             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
4631         }
4632         else
4633 #endif
4634         {
4635             CV_SUPPRESS_DEPRECATED_START
4636             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
4637             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
4638             CV_SUPPRESS_DEPRECATED_END
4639         }
4640         CV_OclDbgAssert(err == CL_SUCCESS);
4641
4642         size_t origin[] = { 0, 0, 0 };
4643         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
4644
4645         cl_mem devData;
4646         if (!alias && !src.isContinuous())
4647         {
4648             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
4649             CV_OclDbgAssert(err == CL_SUCCESS);
4650
4651             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
4652             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
4653                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
4654             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4655         }
4656         else
4657         {
4658             devData = (cl_mem)src.handle(ACCESS_READ);
4659         }
4660         CV_Assert(devData != NULL);
4661
4662         if (!alias)
4663         {
4664             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
4665             if (!src.isContinuous())
4666             {
4667                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4668                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
4669             }
4670         }
4671     }
4672
4673     IMPLEMENT_REFCOUNTABLE();
4674
4675     cl_mem handle;
4676 };
4677
4678 Image2D::Image2D()
4679 {
4680     p = NULL;
4681 }
4682
4683 Image2D::Image2D(const UMat &src, bool norm, bool alias)
4684 {
4685     p = new Impl(src, norm, alias);
4686 }
4687
4688 bool Image2D::canCreateAlias(const UMat &m)
4689 {
4690     bool ret = false;
4691     const Device & d = ocl::Device::getDefault();
4692     if (d.imageFromBufferSupport())
4693     {
4694         // This is the required pitch alignment in pixels
4695         uint pitchAlign = d.imagePitchAlignment();
4696         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
4697         {
4698             // We don't currently handle the case where the buffer was created
4699             // with CL_MEM_USE_HOST_PTR
4700             if (!m.u->tempUMat())
4701             {
4702                 ret = true;
4703             }
4704         }
4705     }
4706     return ret;
4707 }
4708
4709 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
4710 {
4711     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
4712
4713     return Impl::isFormatSupported(format);
4714 }
4715
4716 Image2D::Image2D(const Image2D & i)
4717 {
4718     p = i.p;
4719     if (p)
4720         p->addref();
4721 }
4722
4723 Image2D & Image2D::operator = (const Image2D & i)
4724 {
4725     if (i.p != p)
4726     {
4727         if (i.p)
4728             i.p->addref();
4729         if (p)
4730             p->release();
4731         p = i.p;
4732     }
4733     return *this;
4734 }
4735
4736 Image2D::~Image2D()
4737 {
4738     if (p)
4739         p->release();
4740 }
4741
4742 void* Image2D::ptr() const
4743 {
4744     return p ? p->handle : 0;
4745 }
4746
4747 bool isPerformanceCheckBypassed()
4748 {
4749     static bool initialized = false;
4750     static bool value = false;
4751     if (!initialized)
4752     {
4753         value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
4754         initialized = true;
4755     }
4756     return value;
4757 }
4758
4759 }}