5e18e035ec981a2af00f7a06e05571e39b73a451
[profile/ivi/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50 #define CV_OPENCL_SHOW_RUN_ERRORS       0
51
52 #include "opencv2/core/bufferpool.hpp"
53 #ifndef LOG_BUFFER_POOL
54 # if 0
55 #   define LOG_BUFFER_POOL printf
56 # else
57 #   define LOG_BUFFER_POOL(...)
58 # endif
59 #endif
60
61
62 // TODO Move to some common place
63 static bool getBoolParameter(const char* name, bool defaultValue)
64 {
65     const char* envValue = getenv(name);
66     if (envValue == NULL)
67     {
68         return defaultValue;
69     }
70     cv::String value = envValue;
71     if (value == "1" || value == "True" || value == "true" || value == "TRUE")
72     {
73         return true;
74     }
75     if (value == "0" || value == "False" || value == "false" || value == "FALSE")
76     {
77         return false;
78     }
79     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
80 }
81
82
83 // TODO Move to some common place
84 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
85 {
86 #ifdef HAVE_WINRT
87     const char* envValue = NULL;
88 #else
89     const char* envValue = getenv(name);
90 #endif
91     if (envValue == NULL)
92     {
93         return defaultValue;
94     }
95     cv::String value = envValue;
96     size_t pos = 0;
97     for (; pos < value.size(); pos++)
98     {
99         if (!isdigit(value[pos]))
100             break;
101     }
102     cv::String valueStr = value.substr(0, pos);
103     cv::String suffixStr = value.substr(pos, value.length() - pos);
104     int v = atoi(valueStr.c_str());
105     if (suffixStr.length() == 0)
106         return v;
107     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
108         return v * 1024 * 1024;
109     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
110         return v * 1024;
111     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
112 }
113
114 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
115 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
116
117 #ifdef HAVE_OPENCL
118 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
119 #else
120 // TODO FIXIT: This file can't be build without OPENCL
121
122 /*
123   Part of the file is an extract from the standard OpenCL headers from Khronos site.
124   Below is the original copyright.
125 */
126
127 /*******************************************************************************
128  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
129  *
130  * Permission is hereby granted, free of charge, to any person obtaining a
131  * copy of this software and/or associated documentation files (the
132  * "Materials"), to deal in the Materials without restriction, including
133  * without limitation the rights to use, copy, modify, merge, publish,
134  * distribute, sublicense, and/or sell copies of the Materials, and to
135  * permit persons to whom the Materials are furnished to do so, subject to
136  * the following conditions:
137  *
138  * The above copyright notice and this permission notice shall be included
139  * in all copies or substantial portions of the Materials.
140  *
141  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
142  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
143  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
144  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
145  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
146  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
147  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
148  ******************************************************************************/
149
150 #if 0 //defined __APPLE__
151 #define HAVE_OPENCL 1
152 #else
153 #undef HAVE_OPENCL
154 #endif
155
156 #define OPENCV_CL_NOT_IMPLEMENTED -1000
157
158 #ifdef HAVE_OPENCL
159
160 #if defined __APPLE__
161 #include <OpenCL/opencl.h>
162 #else
163 #include <CL/opencl.h>
164 #endif
165
166 static const bool g_haveOpenCL = true;
167
168 #else
169
170 extern "C" {
171
172 struct _cl_platform_id { int dummy; };
173 struct _cl_device_id { int dummy; };
174 struct _cl_context { int dummy; };
175 struct _cl_command_queue { int dummy; };
176 struct _cl_mem { int dummy; };
177 struct _cl_program { int dummy; };
178 struct _cl_kernel { int dummy; };
179 struct _cl_event { int dummy; };
180 struct _cl_sampler { int dummy; };
181
182 typedef struct _cl_platform_id *    cl_platform_id;
183 typedef struct _cl_device_id *      cl_device_id;
184 typedef struct _cl_context *        cl_context;
185 typedef struct _cl_command_queue *  cl_command_queue;
186 typedef struct _cl_mem *            cl_mem;
187 typedef struct _cl_program *        cl_program;
188 typedef struct _cl_kernel *         cl_kernel;
189 typedef struct _cl_event *          cl_event;
190 typedef struct _cl_sampler *        cl_sampler;
191
192 typedef int cl_int;
193 typedef unsigned cl_uint;
194 #if defined (_WIN32) && defined(_MSC_VER)
195     typedef __int64 cl_long;
196     typedef unsigned __int64 cl_ulong;
197 #else
198     typedef long cl_long;
199     typedef unsigned long cl_ulong;
200 #endif
201
202 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
203 typedef cl_ulong            cl_bitfield;
204 typedef cl_bitfield         cl_device_type;
205 typedef cl_uint             cl_platform_info;
206 typedef cl_uint             cl_device_info;
207 typedef cl_bitfield         cl_device_fp_config;
208 typedef cl_uint             cl_device_mem_cache_type;
209 typedef cl_uint             cl_device_local_mem_type;
210 typedef cl_bitfield         cl_device_exec_capabilities;
211 typedef cl_bitfield         cl_command_queue_properties;
212 typedef intptr_t            cl_device_partition_property;
213 typedef cl_bitfield         cl_device_affinity_domain;
214
215 typedef intptr_t            cl_context_properties;
216 typedef cl_uint             cl_context_info;
217 typedef cl_uint             cl_command_queue_info;
218 typedef cl_uint             cl_channel_order;
219 typedef cl_uint             cl_channel_type;
220 typedef cl_bitfield         cl_mem_flags;
221 typedef cl_uint             cl_mem_object_type;
222 typedef cl_uint             cl_mem_info;
223 typedef cl_bitfield         cl_mem_migration_flags;
224 typedef cl_uint             cl_image_info;
225 typedef cl_uint             cl_buffer_create_type;
226 typedef cl_uint             cl_addressing_mode;
227 typedef cl_uint             cl_filter_mode;
228 typedef cl_uint             cl_sampler_info;
229 typedef cl_bitfield         cl_map_flags;
230 typedef cl_uint             cl_program_info;
231 typedef cl_uint             cl_program_build_info;
232 typedef cl_uint             cl_program_binary_type;
233 typedef cl_int              cl_build_status;
234 typedef cl_uint             cl_kernel_info;
235 typedef cl_uint             cl_kernel_arg_info;
236 typedef cl_uint             cl_kernel_arg_address_qualifier;
237 typedef cl_uint             cl_kernel_arg_access_qualifier;
238 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
239 typedef cl_uint             cl_kernel_work_group_info;
240 typedef cl_uint             cl_event_info;
241 typedef cl_uint             cl_command_type;
242 typedef cl_uint             cl_profiling_info;
243
244
245 typedef struct _cl_image_format {
246     cl_channel_order        image_channel_order;
247     cl_channel_type         image_channel_data_type;
248 } cl_image_format;
249
250 typedef struct _cl_image_desc {
251     cl_mem_object_type      image_type;
252     size_t                  image_width;
253     size_t                  image_height;
254     size_t                  image_depth;
255     size_t                  image_array_size;
256     size_t                  image_row_pitch;
257     size_t                  image_slice_pitch;
258     cl_uint                 num_mip_levels;
259     cl_uint                 num_samples;
260     cl_mem                  buffer;
261 } cl_image_desc;
262
263 typedef struct _cl_buffer_region {
264     size_t                  origin;
265     size_t                  size;
266 } cl_buffer_region;
267
268
269 //////////////////////////////////////////////////////////
270
271 #define CL_SUCCESS                                  0
272 #define CL_DEVICE_NOT_FOUND                         -1
273 #define CL_DEVICE_NOT_AVAILABLE                     -2
274 #define CL_COMPILER_NOT_AVAILABLE                   -3
275 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
276 #define CL_OUT_OF_RESOURCES                         -5
277 #define CL_OUT_OF_HOST_MEMORY                       -6
278 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
279 #define CL_MEM_COPY_OVERLAP                         -8
280 #define CL_IMAGE_FORMAT_MISMATCH                    -9
281 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
282 #define CL_BUILD_PROGRAM_FAILURE                    -11
283 #define CL_MAP_FAILURE                              -12
284 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
285 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
286 #define CL_COMPILE_PROGRAM_FAILURE                  -15
287 #define CL_LINKER_NOT_AVAILABLE                     -16
288 #define CL_LINK_PROGRAM_FAILURE                     -17
289 #define CL_DEVICE_PARTITION_FAILED                  -18
290 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
291
292 #define CL_INVALID_VALUE                            -30
293 #define CL_INVALID_DEVICE_TYPE                      -31
294 #define CL_INVALID_PLATFORM                         -32
295 #define CL_INVALID_DEVICE                           -33
296 #define CL_INVALID_CONTEXT                          -34
297 #define CL_INVALID_QUEUE_PROPERTIES                 -35
298 #define CL_INVALID_COMMAND_QUEUE                    -36
299 #define CL_INVALID_HOST_PTR                         -37
300 #define CL_INVALID_MEM_OBJECT                       -38
301 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
302 #define CL_INVALID_IMAGE_SIZE                       -40
303 #define CL_INVALID_SAMPLER                          -41
304 #define CL_INVALID_BINARY                           -42
305 #define CL_INVALID_BUILD_OPTIONS                    -43
306 #define CL_INVALID_PROGRAM                          -44
307 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
308 #define CL_INVALID_KERNEL_NAME                      -46
309 #define CL_INVALID_KERNEL_DEFINITION                -47
310 #define CL_INVALID_KERNEL                           -48
311 #define CL_INVALID_ARG_INDEX                        -49
312 #define CL_INVALID_ARG_VALUE                        -50
313 #define CL_INVALID_ARG_SIZE                         -51
314 #define CL_INVALID_KERNEL_ARGS                      -52
315 #define CL_INVALID_WORK_DIMENSION                   -53
316 #define CL_INVALID_WORK_GROUP_SIZE                  -54
317 #define CL_INVALID_WORK_ITEM_SIZE                   -55
318 #define CL_INVALID_GLOBAL_OFFSET                    -56
319 #define CL_INVALID_EVENT_WAIT_LIST                  -57
320 #define CL_INVALID_EVENT                            -58
321 #define CL_INVALID_OPERATION                        -59
322 #define CL_INVALID_GL_OBJECT                        -60
323 #define CL_INVALID_BUFFER_SIZE                      -61
324 #define CL_INVALID_MIP_LEVEL                        -62
325 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
326 #define CL_INVALID_PROPERTY                         -64
327 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
328 #define CL_INVALID_COMPILER_OPTIONS                 -66
329 #define CL_INVALID_LINKER_OPTIONS                   -67
330 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
331
332 /*#define CL_VERSION_1_0                              1
333 #define CL_VERSION_1_1                              1
334 #define CL_VERSION_1_2                              1*/
335
336 #define CL_FALSE                                    0
337 #define CL_TRUE                                     1
338 #define CL_BLOCKING                                 CL_TRUE
339 #define CL_NON_BLOCKING                             CL_FALSE
340
341 #define CL_PLATFORM_PROFILE                         0x0900
342 #define CL_PLATFORM_VERSION                         0x0901
343 #define CL_PLATFORM_NAME                            0x0902
344 #define CL_PLATFORM_VENDOR                          0x0903
345 #define CL_PLATFORM_EXTENSIONS                      0x0904
346
347 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
348 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
349 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
350 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
351 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
352 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
353 #define CL_DEVICE_TYPE                              0x1000
354 #define CL_DEVICE_VENDOR_ID                         0x1001
355 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
356 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
357 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
358 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
359 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
360 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
361 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
362 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
363 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
364 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
365 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
366 #define CL_DEVICE_ADDRESS_BITS                      0x100D
367 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
368 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
369 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
370 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
371 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
372 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
373 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
374 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
375 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
376 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
377 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
378 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
379 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
380 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
381 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
382 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
383 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
384 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
385 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
386 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
387 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
388 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
389 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
390 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
391 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
392 #define CL_DEVICE_AVAILABLE                         0x1027
393 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
394 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
395 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
396 #define CL_DEVICE_NAME                              0x102B
397 #define CL_DEVICE_VENDOR                            0x102C
398 #define CL_DRIVER_VERSION                           0x102D
399 #define CL_DEVICE_PROFILE                           0x102E
400 #define CL_DEVICE_VERSION                           0x102F
401 #define CL_DEVICE_EXTENSIONS                        0x1030
402 #define CL_DEVICE_PLATFORM                          0x1031
403 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
404 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
405 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
406 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
407 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
408 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
409 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
410 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
411 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
412 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
413 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
414 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
415 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
416 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
417 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
418 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
419 #define CL_DEVICE_PARENT_DEVICE                     0x1042
420 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
421 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
422 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
423 #define CL_DEVICE_PARTITION_TYPE                    0x1046
424 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
425 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
426 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
427 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
428 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
429
430 #define CL_FP_DENORM                                (1 << 0)
431 #define CL_FP_INF_NAN                               (1 << 1)
432 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
433 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
434 #define CL_FP_ROUND_TO_INF                          (1 << 4)
435 #define CL_FP_FMA                                   (1 << 5)
436 #define CL_FP_SOFT_FLOAT                            (1 << 6)
437 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
438
439 #define CL_NONE                                     0x0
440 #define CL_READ_ONLY_CACHE                          0x1
441 #define CL_READ_WRITE_CACHE                         0x2
442 #define CL_LOCAL                                    0x1
443 #define CL_GLOBAL                                   0x2
444 #define CL_EXEC_KERNEL                              (1 << 0)
445 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
446 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
447 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
448
449 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
450 #define CL_CONTEXT_DEVICES                          0x1081
451 #define CL_CONTEXT_PROPERTIES                       0x1082
452 #define CL_CONTEXT_NUM_DEVICES                      0x1083
453 #define CL_CONTEXT_PLATFORM                         0x1084
454 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
455
456 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
457 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
458 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
459 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
460 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
461 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
462 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
463 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
464 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
465 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
466 #define CL_QUEUE_CONTEXT                            0x1090
467 #define CL_QUEUE_DEVICE                             0x1091
468 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
469 #define CL_QUEUE_PROPERTIES                         0x1093
470 #define CL_MEM_READ_WRITE                           (1 << 0)
471 #define CL_MEM_WRITE_ONLY                           (1 << 1)
472 #define CL_MEM_READ_ONLY                            (1 << 2)
473 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
474 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
475 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
476 // reserved                                         (1 << 6)
477 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
478 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
479 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
480 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
481 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
482
483 #define CL_R                                        0x10B0
484 #define CL_A                                        0x10B1
485 #define CL_RG                                       0x10B2
486 #define CL_RA                                       0x10B3
487 #define CL_RGB                                      0x10B4
488 #define CL_RGBA                                     0x10B5
489 #define CL_BGRA                                     0x10B6
490 #define CL_ARGB                                     0x10B7
491 #define CL_INTENSITY                                0x10B8
492 #define CL_LUMINANCE                                0x10B9
493 #define CL_Rx                                       0x10BA
494 #define CL_RGx                                      0x10BB
495 #define CL_RGBx                                     0x10BC
496 #define CL_DEPTH                                    0x10BD
497 #define CL_DEPTH_STENCIL                            0x10BE
498
499 #define CL_SNORM_INT8                               0x10D0
500 #define CL_SNORM_INT16                              0x10D1
501 #define CL_UNORM_INT8                               0x10D2
502 #define CL_UNORM_INT16                              0x10D3
503 #define CL_UNORM_SHORT_565                          0x10D4
504 #define CL_UNORM_SHORT_555                          0x10D5
505 #define CL_UNORM_INT_101010                         0x10D6
506 #define CL_SIGNED_INT8                              0x10D7
507 #define CL_SIGNED_INT16                             0x10D8
508 #define CL_SIGNED_INT32                             0x10D9
509 #define CL_UNSIGNED_INT8                            0x10DA
510 #define CL_UNSIGNED_INT16                           0x10DB
511 #define CL_UNSIGNED_INT32                           0x10DC
512 #define CL_HALF_FLOAT                               0x10DD
513 #define CL_FLOAT                                    0x10DE
514 #define CL_UNORM_INT24                              0x10DF
515
516 #define CL_MEM_OBJECT_BUFFER                        0x10F0
517 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
518 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
519 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
520 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
521 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
522 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
523
524 #define CL_MEM_TYPE                                 0x1100
525 #define CL_MEM_FLAGS                                0x1101
526 #define CL_MEM_SIZE                                 0x1102
527 #define CL_MEM_HOST_PTR                             0x1103
528 #define CL_MEM_MAP_COUNT                            0x1104
529 #define CL_MEM_REFERENCE_COUNT                      0x1105
530 #define CL_MEM_CONTEXT                              0x1106
531 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
532 #define CL_MEM_OFFSET                               0x1108
533
534 #define CL_IMAGE_FORMAT                             0x1110
535 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
536 #define CL_IMAGE_ROW_PITCH                          0x1112
537 #define CL_IMAGE_SLICE_PITCH                        0x1113
538 #define CL_IMAGE_WIDTH                              0x1114
539 #define CL_IMAGE_HEIGHT                             0x1115
540 #define CL_IMAGE_DEPTH                              0x1116
541 #define CL_IMAGE_ARRAY_SIZE                         0x1117
542 #define CL_IMAGE_BUFFER                             0x1118
543 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
544 #define CL_IMAGE_NUM_SAMPLES                        0x111A
545
546 #define CL_ADDRESS_NONE                             0x1130
547 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
548 #define CL_ADDRESS_CLAMP                            0x1132
549 #define CL_ADDRESS_REPEAT                           0x1133
550 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
551
552 #define CL_FILTER_NEAREST                           0x1140
553 #define CL_FILTER_LINEAR                            0x1141
554
555 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
556 #define CL_SAMPLER_CONTEXT                          0x1151
557 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
558 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
559 #define CL_SAMPLER_FILTER_MODE                      0x1154
560
561 #define CL_MAP_READ                                 (1 << 0)
562 #define CL_MAP_WRITE                                (1 << 1)
563 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
564
565 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
566 #define CL_PROGRAM_CONTEXT                          0x1161
567 #define CL_PROGRAM_NUM_DEVICES                      0x1162
568 #define CL_PROGRAM_DEVICES                          0x1163
569 #define CL_PROGRAM_SOURCE                           0x1164
570 #define CL_PROGRAM_BINARY_SIZES                     0x1165
571 #define CL_PROGRAM_BINARIES                         0x1166
572 #define CL_PROGRAM_NUM_KERNELS                      0x1167
573 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
574 #define CL_PROGRAM_BUILD_STATUS                     0x1181
575 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
576 #define CL_PROGRAM_BUILD_LOG                        0x1183
577 #define CL_PROGRAM_BINARY_TYPE                      0x1184
578 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
579 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
580 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
581 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
582
583 #define CL_BUILD_SUCCESS                            0
584 #define CL_BUILD_NONE                               -1
585 #define CL_BUILD_ERROR                              -2
586 #define CL_BUILD_IN_PROGRESS                        -3
587
588 #define CL_KERNEL_FUNCTION_NAME                     0x1190
589 #define CL_KERNEL_NUM_ARGS                          0x1191
590 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
591 #define CL_KERNEL_CONTEXT                           0x1193
592 #define CL_KERNEL_PROGRAM                           0x1194
593 #define CL_KERNEL_ATTRIBUTES                        0x1195
594 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
595 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
596 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
597 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
598 #define CL_KERNEL_ARG_NAME                          0x119A
599 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
600 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
601 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
602 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
603 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
604 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
605 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
606 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
607 #define CL_KERNEL_ARG_TYPE_NONE                     0
608 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
609 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
610 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
611 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
612 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
613 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
614 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
615 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
616 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
617
618 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
619 #define CL_EVENT_COMMAND_TYPE                       0x11D1
620 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
621 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
622 #define CL_EVENT_CONTEXT                            0x11D4
623
624 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
625 #define CL_COMMAND_TASK                             0x11F1
626 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
627 #define CL_COMMAND_READ_BUFFER                      0x11F3
628 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
629 #define CL_COMMAND_COPY_BUFFER                      0x11F5
630 #define CL_COMMAND_READ_IMAGE                       0x11F6
631 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
632 #define CL_COMMAND_COPY_IMAGE                       0x11F8
633 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
634 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
635 #define CL_COMMAND_MAP_BUFFER                       0x11FB
636 #define CL_COMMAND_MAP_IMAGE                        0x11FC
637 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
638 #define CL_COMMAND_MARKER                           0x11FE
639 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
640 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
641 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
642 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
643 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
644 #define CL_COMMAND_USER                             0x1204
645 #define CL_COMMAND_BARRIER                          0x1205
646 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
647 #define CL_COMMAND_FILL_BUFFER                      0x1207
648 #define CL_COMMAND_FILL_IMAGE                       0x1208
649
650 #define CL_COMPLETE                                 0x0
651 #define CL_RUNNING                                  0x1
652 #define CL_SUBMITTED                                0x2
653 #define CL_QUEUED                                   0x3
654 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
655
656 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
657 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
658 #define CL_PROFILING_COMMAND_START                  0x1282
659 #define CL_PROFILING_COMMAND_END                    0x1283
660
661 #define CL_CALLBACK CV_STDCALL
662
663 static volatile bool g_haveOpenCL = false;
664 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
665
666 #if defined(__APPLE__)
667 #include <dlfcn.h>
668
669 static void* initOpenCLAndLoad(const char* funcname)
670 {
671     static bool initialized = false;
672     static void* handle = 0;
673     if (!handle)
674     {
675         if(!initialized)
676         {
677             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
678             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
679                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
680             handle = dlopen(oclpath, RTLD_LAZY);
681             initialized = true;
682             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
683             if( g_haveOpenCL )
684                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
685             else
686                 fprintf(stderr, "Failed to load OpenCL runtime\n");
687         }
688         if(!handle)
689             return 0;
690     }
691
692     return funcname && handle ? dlsym(handle, funcname) : 0;
693 }
694
695 #elif defined WIN32 || defined _WIN32
696
697 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
698   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
699 #endif
700 #include <windows.h>
701 #if (_WIN32_WINNT >= 0x0602)
702   #include <synchapi.h>
703 #endif
704 #undef small
705 #undef min
706 #undef max
707 #undef abs
708
709 static void* initOpenCLAndLoad(const char* funcname)
710 {
711     static bool initialized = false;
712     static HMODULE handle = 0;
713     if (!handle)
714     {
715 #ifndef HAVE_WINRT
716         if(!initialized)
717         {
718             handle = LoadLibraryA("OpenCL.dll");
719             initialized = true;
720             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
721         }
722 #endif
723         if(!handle)
724             return 0;
725     }
726
727     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
728 }
729
730 #elif defined(__linux)
731
732 #include <dlfcn.h>
733 #include <stdio.h>
734
735 static void* initOpenCLAndLoad(const char* funcname)
736 {
737     static bool initialized = false;
738     static void* handle = 0;
739     if (!handle)
740     {
741         if(!initialized)
742         {
743             handle = dlopen("libOpenCL.so", RTLD_LAZY);
744             if(!handle)
745                 handle = dlopen("libCL.so", RTLD_LAZY);
746             initialized = true;
747             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
748         }
749         if(!handle)
750             return 0;
751     }
752
753     return funcname ? (void*)dlsym(handle, funcname) : 0;
754 }
755
756 #else
757
758 static void* initOpenCLAndLoad(const char*)
759 {
760     return 0;
761 }
762
763 #endif
764
765
766 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
767     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
768     static rettype funcname argsdecl \
769     { \
770         static funcname##_t funcname##_p = 0; \
771         if( !funcname##_p ) \
772         { \
773             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
774             if( !funcname##_p ) \
775                 return OPENCV_CL_NOT_IMPLEMENTED; \
776         } \
777         return funcname##_p args; \
778     }
779
780
781 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
782     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
783     static rettype funcname argsdecl \
784     { \
785         static funcname##_t funcname##_p = 0; \
786         if( !funcname##_p ) \
787         { \
788             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
789             if( !funcname##_p ) \
790             { \
791                 if( errcode_ret ) \
792                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
793                 return 0; \
794             } \
795         } \
796         return funcname##_p args; \
797     }
798
799 OCL_FUNC(cl_int, clGetPlatformIDs,
800     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
801     (num_entries, platforms, num_platforms))
802
803 OCL_FUNC(cl_int, clGetPlatformInfo,
804     (cl_platform_id platform, cl_platform_info param_name,
805     size_t param_value_size, void * param_value,
806     size_t * param_value_size_ret),
807     (platform, param_name, param_value_size, param_value, param_value_size_ret))
808
809 OCL_FUNC(cl_int, clGetDeviceInfo,
810          (cl_device_id device,
811           cl_device_info param_name,
812           size_t param_value_size,
813           void * param_value,
814           size_t * param_value_size_ret),
815          (device, param_name, param_value_size, param_value, param_value_size_ret))
816
817
818 OCL_FUNC(cl_int, clGetDeviceIDs,
819     (cl_platform_id platform,
820     cl_device_type device_type,
821     cl_uint num_entries,
822     cl_device_id * devices,
823     cl_uint * num_devices),
824     (platform, device_type, num_entries, devices, num_devices))
825
826 OCL_FUNC_P(cl_context, clCreateContext,
827     (const cl_context_properties * properties,
828     cl_uint num_devices,
829     const cl_device_id * devices,
830     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
831     void * user_data,
832     cl_int * errcode_ret),
833     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
834
835 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
836
837 /*
838 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
839
840 OCL_FUNC_P(cl_context, clCreateContextFromType,
841     (const cl_context_properties * properties,
842     cl_device_type device_type,
843     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
844     void * user_data,
845     cl_int * errcode_ret),
846     (properties, device_type, pfn_notify, user_data, errcode_ret))
847
848 OCL_FUNC(cl_int, clGetContextInfo,
849     (cl_context context,
850     cl_context_info param_name,
851     size_t param_value_size,
852     void * param_value,
853     size_t * param_value_size_ret),
854     (context, param_name, param_value_size,
855     param_value, param_value_size_ret))
856 */
857 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
858     (cl_context context,
859     cl_device_id device,
860     cl_command_queue_properties properties,
861     cl_int * errcode_ret),
862     (context, device, properties, errcode_ret))
863
864 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
865
866 OCL_FUNC_P(cl_mem, clCreateBuffer,
867     (cl_context context,
868     cl_mem_flags flags,
869     size_t size,
870     void * host_ptr,
871     cl_int * errcode_ret),
872     (context, flags, size, host_ptr, errcode_ret))
873
874 /*
875 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
876
877 OCL_FUNC(cl_int, clGetCommandQueueInfo,
878  (cl_command_queue command_queue,
879  cl_command_queue_info param_name,
880  size_t param_value_size,
881  void * param_value,
882  size_t * param_value_size_ret),
883  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
884
885 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
886     (cl_mem buffer,
887     cl_mem_flags flags,
888     cl_buffer_create_type buffer_create_type,
889     const void * buffer_create_info,
890     cl_int * errcode_ret),
891     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
892 */
893
894 OCL_FUNC_P(cl_mem, clCreateImage,
895     (cl_context context,
896     cl_mem_flags flags,
897     const cl_image_format * image_format,
898     const cl_image_desc * image_desc,
899     void * host_ptr,
900     cl_int * errcode_ret),
901     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
902
903 OCL_FUNC_P(cl_mem, clCreateImage2D,
904     (cl_context context,
905     cl_mem_flags flags,
906     const cl_image_format * image_format,
907     size_t image_width,
908     size_t image_height,
909     size_t image_row_pitch,
910     void * host_ptr,
911     cl_int *errcode_ret),
912     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
913
914 OCL_FUNC(cl_int, clGetSupportedImageFormats,
915  (cl_context context,
916  cl_mem_flags flags,
917  cl_mem_object_type image_type,
918  cl_uint num_entries,
919  cl_image_format * image_formats,
920  cl_uint * num_image_formats),
921  (context, flags, image_type, num_entries, image_formats, num_image_formats))
922
923 /*
924 OCL_FUNC(cl_int, clGetMemObjectInfo,
925  (cl_mem memobj,
926  cl_mem_info param_name,
927  size_t param_value_size,
928  void * param_value,
929  size_t * param_value_size_ret),
930  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
931
932 OCL_FUNC(cl_int, clGetImageInfo,
933  (cl_mem image,
934  cl_image_info param_name,
935  size_t param_value_size,
936  void * param_value,
937  size_t * param_value_size_ret),
938  (image, param_name, param_value_size, param_value, param_value_size_ret))
939
940 OCL_FUNC(cl_int, clCreateKernelsInProgram,
941  (cl_program program,
942  cl_uint num_kernels,
943  cl_kernel * kernels,
944  cl_uint * num_kernels_ret),
945  (program, num_kernels, kernels, num_kernels_ret))
946
947 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
948
949 OCL_FUNC(cl_int, clGetKernelArgInfo,
950  (cl_kernel kernel,
951  cl_uint arg_indx,
952  cl_kernel_arg_info param_name,
953  size_t param_value_size,
954  void * param_value,
955  size_t * param_value_size_ret),
956  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
957
958 OCL_FUNC(cl_int, clEnqueueReadImage,
959  (cl_command_queue command_queue,
960  cl_mem image,
961  cl_bool blocking_read,
962  const size_t * origin[3],
963  const size_t * region[3],
964  size_t row_pitch,
965  size_t slice_pitch,
966  void * ptr,
967  cl_uint num_events_in_wait_list,
968  const cl_event * event_wait_list,
969  cl_event * event),
970  (command_queue, image, blocking_read, origin, region,
971  row_pitch, slice_pitch,
972  ptr,
973  num_events_in_wait_list,
974  event_wait_list,
975  event))
976
977 OCL_FUNC(cl_int, clEnqueueWriteImage,
978  (cl_command_queue command_queue,
979  cl_mem image,
980  cl_bool blocking_write,
981  const size_t * origin[3],
982  const size_t * region[3],
983  size_t input_row_pitch,
984  size_t input_slice_pitch,
985  const void * ptr,
986  cl_uint num_events_in_wait_list,
987  const cl_event * event_wait_list,
988  cl_event * event),
989  (command_queue, image, blocking_write, origin, region, input_row_pitch,
990  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
991
992 OCL_FUNC(cl_int, clEnqueueFillImage,
993  (cl_command_queue command_queue,
994  cl_mem image,
995  const void * fill_color,
996  const size_t * origin[3],
997  const size_t * region[3],
998  cl_uint num_events_in_wait_list,
999  const cl_event * event_wait_list,
1000  cl_event * event),
1001  (command_queue, image, fill_color, origin, region,
1002  num_events_in_wait_list, event_wait_list, event))
1003
1004 OCL_FUNC(cl_int, clEnqueueCopyImage,
1005  (cl_command_queue command_queue,
1006  cl_mem src_image,
1007  cl_mem dst_image,
1008  const size_t * src_origin[3],
1009  const size_t * dst_origin[3],
1010  const size_t * region[3],
1011  cl_uint num_events_in_wait_list,
1012  const cl_event * event_wait_list,
1013  cl_event * event),
1014  (command_queue, src_image, dst_image, src_origin, dst_origin,
1015  region, num_events_in_wait_list, event_wait_list, event))
1016
1017 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
1018  (cl_command_queue command_queue,
1019  cl_mem src_image,
1020  cl_mem dst_buffer,
1021  const size_t * src_origin[3],
1022  const size_t * region[3],
1023  size_t dst_offset,
1024  cl_uint num_events_in_wait_list,
1025  const cl_event * event_wait_list,
1026  cl_event * event),
1027  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1028  num_events_in_wait_list, event_wait_list, event))
1029 */
1030
1031 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1032  (cl_command_queue command_queue,
1033  cl_mem src_buffer,
1034  cl_mem dst_image,
1035  size_t src_offset,
1036  const size_t dst_origin[3],
1037  const size_t region[3],
1038  cl_uint num_events_in_wait_list,
1039  const cl_event * event_wait_list,
1040  cl_event * event),
1041  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1042  region, num_events_in_wait_list, event_wait_list, event))
1043
1044  OCL_FUNC(cl_int, clFlush,
1045  (cl_command_queue command_queue),
1046  (command_queue))
1047
1048 /*
1049 OCL_FUNC_P(void*, clEnqueueMapImage,
1050  (cl_command_queue command_queue,
1051  cl_mem image,
1052  cl_bool blocking_map,
1053  cl_map_flags map_flags,
1054  const size_t * origin[3],
1055  const size_t * region[3],
1056  size_t * image_row_pitch,
1057  size_t * image_slice_pitch,
1058  cl_uint num_events_in_wait_list,
1059  const cl_event * event_wait_list,
1060  cl_event * event,
1061  cl_int * errcode_ret),
1062  (command_queue, image, blocking_map, map_flags, origin, region,
1063  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1064  event_wait_list, event, errcode_ret))
1065 */
1066
1067 /*
1068 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1069
1070 OCL_FUNC(cl_int, clGetKernelInfo,
1071  (cl_kernel kernel,
1072  cl_kernel_info param_name,
1073  size_t param_value_size,
1074  void * param_value,
1075  size_t * param_value_size_ret),
1076  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1077
1078 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1079
1080 */
1081
1082 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1083
1084
1085 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1086     (cl_context context,
1087     cl_uint count,
1088     const char ** strings,
1089     const size_t * lengths,
1090     cl_int * errcode_ret),
1091     (context, count, strings, lengths, errcode_ret))
1092
1093 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1094     (cl_context context,
1095     cl_uint num_devices,
1096     const cl_device_id * device_list,
1097     const size_t * lengths,
1098     const unsigned char ** binaries,
1099     cl_int * binary_status,
1100     cl_int * errcode_ret),
1101     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1102
1103 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1104
1105 OCL_FUNC(cl_int, clBuildProgram,
1106     (cl_program program,
1107     cl_uint num_devices,
1108     const cl_device_id * device_list,
1109     const char * options,
1110     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1111     void * user_data),
1112     (program, num_devices, device_list, options, pfn_notify, user_data))
1113
1114 OCL_FUNC(cl_int, clGetProgramInfo,
1115     (cl_program program,
1116     cl_program_info param_name,
1117     size_t param_value_size,
1118     void * param_value,
1119     size_t * param_value_size_ret),
1120     (program, param_name, param_value_size, param_value, param_value_size_ret))
1121
1122 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1123     (cl_program program,
1124     cl_device_id device,
1125     cl_program_build_info param_name,
1126     size_t param_value_size,
1127     void * param_value,
1128     size_t * param_value_size_ret),
1129     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1130
1131 OCL_FUNC_P(cl_kernel, clCreateKernel,
1132     (cl_program program,
1133     const char * kernel_name,
1134     cl_int * errcode_ret),
1135     (program, kernel_name, errcode_ret))
1136
1137 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1138
1139 OCL_FUNC(cl_int, clSetKernelArg,
1140     (cl_kernel kernel,
1141     cl_uint arg_index,
1142     size_t arg_size,
1143     const void * arg_value),
1144     (kernel, arg_index, arg_size, arg_value))
1145
1146 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1147     (cl_kernel kernel,
1148     cl_device_id device,
1149     cl_kernel_work_group_info param_name,
1150     size_t param_value_size,
1151     void * param_value,
1152     size_t * param_value_size_ret),
1153     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1154
1155 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1156
1157 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1158     (cl_command_queue command_queue,
1159     cl_mem buffer,
1160     cl_bool blocking_read,
1161     size_t offset,
1162     size_t size,
1163     void * ptr,
1164     cl_uint num_events_in_wait_list,
1165     const cl_event * event_wait_list,
1166     cl_event * event),
1167     (command_queue, buffer, blocking_read, offset, size, ptr,
1168     num_events_in_wait_list, event_wait_list, event))
1169
1170 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1171     (cl_command_queue command_queue,
1172     cl_mem buffer,
1173     cl_bool blocking_read,
1174     const size_t * buffer_offset,
1175     const size_t * host_offset,
1176     const size_t * region,
1177     size_t buffer_row_pitch,
1178     size_t buffer_slice_pitch,
1179     size_t host_row_pitch,
1180     size_t host_slice_pitch,
1181     void * ptr,
1182     cl_uint num_events_in_wait_list,
1183     const cl_event * event_wait_list,
1184     cl_event * event),
1185     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1186     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1187     event_wait_list, event))
1188
1189 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1190     (cl_command_queue command_queue,
1191     cl_mem buffer,
1192     cl_bool blocking_write,
1193     size_t offset,
1194     size_t size,
1195     const void * ptr,
1196     cl_uint num_events_in_wait_list,
1197     const cl_event * event_wait_list,
1198     cl_event * event),
1199     (command_queue, buffer, blocking_write, offset, size, ptr,
1200     num_events_in_wait_list, event_wait_list, event))
1201
1202 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1203     (cl_command_queue command_queue,
1204     cl_mem buffer,
1205     cl_bool blocking_write,
1206     const size_t * buffer_offset,
1207     const size_t * host_offset,
1208     const size_t * region,
1209     size_t buffer_row_pitch,
1210     size_t buffer_slice_pitch,
1211     size_t host_row_pitch,
1212     size_t host_slice_pitch,
1213     const void * ptr,
1214     cl_uint num_events_in_wait_list,
1215     const cl_event * event_wait_list,
1216     cl_event * event),
1217     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1218     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1219     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1220
1221 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1222     (cl_command_queue command_queue,
1223     cl_mem buffer,
1224     const void * pattern,
1225     size_t pattern_size,
1226     size_t offset,
1227     size_t size,
1228     cl_uint num_events_in_wait_list,
1229     const cl_event * event_wait_list,
1230     cl_event * event),
1231     (command_queue, buffer, pattern, pattern_size, offset, size,
1232     num_events_in_wait_list, event_wait_list, event))*/
1233
1234 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1235     (cl_command_queue command_queue,
1236     cl_mem src_buffer,
1237     cl_mem dst_buffer,
1238     size_t src_offset,
1239     size_t dst_offset,
1240     size_t size,
1241     cl_uint num_events_in_wait_list,
1242     const cl_event * event_wait_list,
1243     cl_event * event),
1244     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1245     size, num_events_in_wait_list, event_wait_list, event))
1246
1247 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1248     (cl_command_queue command_queue,
1249     cl_mem src_buffer,
1250     cl_mem dst_buffer,
1251     const size_t * src_origin,
1252     const size_t * dst_origin,
1253     const size_t * region,
1254     size_t src_row_pitch,
1255     size_t src_slice_pitch,
1256     size_t dst_row_pitch,
1257     size_t dst_slice_pitch,
1258     cl_uint num_events_in_wait_list,
1259     const cl_event * event_wait_list,
1260     cl_event * event),
1261     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1262     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1263     num_events_in_wait_list, event_wait_list, event))
1264
1265 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1266     (cl_command_queue command_queue,
1267     cl_mem buffer,
1268     cl_bool blocking_map,
1269     cl_map_flags map_flags,
1270     size_t offset,
1271     size_t size,
1272     cl_uint num_events_in_wait_list,
1273     const cl_event * event_wait_list,
1274     cl_event * event,
1275     cl_int * errcode_ret),
1276     (command_queue, buffer, blocking_map, map_flags, offset, size,
1277     num_events_in_wait_list, event_wait_list, event, errcode_ret))
1278
1279 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1280     (cl_command_queue command_queue,
1281     cl_mem memobj,
1282     void * mapped_ptr,
1283     cl_uint num_events_in_wait_list,
1284     const cl_event * event_wait_list,
1285     cl_event * event),
1286     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1287
1288 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1289     (cl_command_queue command_queue,
1290     cl_kernel kernel,
1291     cl_uint work_dim,
1292     const size_t * global_work_offset,
1293     const size_t * global_work_size,
1294     const size_t * local_work_size,
1295     cl_uint num_events_in_wait_list,
1296     const cl_event * event_wait_list,
1297     cl_event * event),
1298     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1299     local_work_size, num_events_in_wait_list, event_wait_list, event))
1300
1301 OCL_FUNC(cl_int, clEnqueueTask,
1302     (cl_command_queue command_queue,
1303     cl_kernel kernel,
1304     cl_uint num_events_in_wait_list,
1305     const cl_event * event_wait_list,
1306     cl_event * event),
1307     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1308
1309 OCL_FUNC(cl_int, clSetEventCallback,
1310     (cl_event event,
1311     cl_int command_exec_callback_type ,
1312     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1313     void *user_data),
1314     (event, command_exec_callback_type, pfn_event_notify, user_data))
1315
1316 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1317
1318 }
1319
1320 #endif
1321
1322 #ifndef CL_VERSION_1_2
1323 #define CL_VERSION_1_2
1324 #endif
1325
1326 #endif
1327
1328 #ifdef _DEBUG
1329 #define CV_OclDbgAssert CV_DbgAssert
1330 #else
1331 static bool isRaiseError()
1332 {
1333     static bool initialized = false;
1334     static bool value = false;
1335     if (!initialized)
1336     {
1337         value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
1338         initialized = true;
1339     }
1340     return value;
1341 }
1342 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
1343 #endif
1344
1345 namespace cv { namespace ocl {
1346
1347 struct UMat2D
1348 {
1349     UMat2D(const UMat& m)
1350     {
1351         offset = (int)m.offset;
1352         step = (int)m.step;
1353         rows = m.rows;
1354         cols = m.cols;
1355     }
1356     int offset;
1357     int step;
1358     int rows;
1359     int cols;
1360 };
1361
1362 struct UMat3D
1363 {
1364     UMat3D(const UMat& m)
1365     {
1366         offset = (int)m.offset;
1367         step = (int)m.step.p[1];
1368         slicestep = (int)m.step.p[0];
1369         slices = (int)m.size.p[0];
1370         rows = m.size.p[1];
1371         cols = m.size.p[2];
1372     }
1373     int offset;
1374     int slicestep;
1375     int step;
1376     int slices;
1377     int rows;
1378     int cols;
1379 };
1380
1381 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1382 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1383 {
1384     static uint64 table[256];
1385     static bool initialized = false;
1386
1387     if( !initialized )
1388     {
1389         for( int i = 0; i < 256; i++ )
1390         {
1391             uint64 c = i;
1392             for( int j = 0; j < 8; j++ )
1393                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1394             table[i] = c;
1395         }
1396         initialized = true;
1397     }
1398
1399     uint64 crc = ~crc0;
1400     for( size_t idx = 0; idx < size; idx++ )
1401         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1402
1403     return ~crc;
1404 }
1405
1406 struct HashKey
1407 {
1408     typedef uint64 part;
1409     HashKey(part _a, part _b) : a(_a), b(_b) {}
1410     part a, b;
1411 };
1412
1413 inline bool operator == (const HashKey& h1, const HashKey& h2)
1414 {
1415     return h1.a == h2.a && h1.b == h2.b;
1416 }
1417
1418 inline bool operator < (const HashKey& h1, const HashKey& h2)
1419 {
1420     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1421 }
1422
1423
1424 bool haveOpenCL()
1425 {
1426 #ifdef HAVE_OPENCL
1427     static bool g_isOpenCLInitialized = false;
1428     static bool g_isOpenCLAvailable = false;
1429
1430     if (!g_isOpenCLInitialized)
1431     {
1432         try
1433         {
1434             cl_uint n = 0;
1435             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1436         }
1437         catch (...)
1438         {
1439             g_isOpenCLAvailable = false;
1440         }
1441         g_isOpenCLInitialized = true;
1442     }
1443     return g_isOpenCLAvailable;
1444 #else
1445     return false;
1446 #endif
1447 }
1448
1449 bool useOpenCL()
1450 {
1451     CoreTLSData* data = coreTlsData.get();
1452     if( data->useOpenCL < 0 )
1453     {
1454         try
1455         {
1456             data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
1457         }
1458         catch (...)
1459         {
1460             data->useOpenCL = 0;
1461         }
1462     }
1463     return data->useOpenCL > 0;
1464 }
1465
1466 void setUseOpenCL(bool flag)
1467 {
1468     if( haveOpenCL() )
1469     {
1470         CoreTLSData* data = coreTlsData.get();
1471         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1472     }
1473 }
1474
1475 #ifdef HAVE_CLAMDBLAS
1476
1477 class AmdBlasHelper
1478 {
1479 public:
1480     static AmdBlasHelper & getInstance()
1481     {
1482         static AmdBlasHelper amdBlas;
1483         return amdBlas;
1484     }
1485
1486     bool isAvailable() const
1487     {
1488         return g_isAmdBlasAvailable;
1489     }
1490
1491     ~AmdBlasHelper()
1492     {
1493         try
1494         {
1495             clAmdBlasTeardown();
1496         }
1497         catch (...) { }
1498     }
1499
1500 protected:
1501     AmdBlasHelper()
1502     {
1503         if (!g_isAmdBlasInitialized)
1504         {
1505             AutoLock lock(m);
1506
1507             if (!g_isAmdBlasInitialized && haveOpenCL())
1508             {
1509                 try
1510                 {
1511                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1512                 }
1513                 catch (...)
1514                 {
1515                     g_isAmdBlasAvailable = false;
1516                 }
1517             }
1518             else
1519                 g_isAmdBlasAvailable = false;
1520
1521             g_isAmdBlasInitialized = true;
1522         }
1523     }
1524
1525 private:
1526     static Mutex m;
1527     static bool g_isAmdBlasInitialized;
1528     static bool g_isAmdBlasAvailable;
1529 };
1530
1531 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1532 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1533 Mutex AmdBlasHelper::m;
1534
1535 bool haveAmdBlas()
1536 {
1537     return AmdBlasHelper::getInstance().isAvailable();
1538 }
1539
1540 #else
1541
1542 bool haveAmdBlas()
1543 {
1544     return false;
1545 }
1546
1547 #endif
1548
1549 #ifdef HAVE_CLAMDFFT
1550
1551 class AmdFftHelper
1552 {
1553 public:
1554     static AmdFftHelper & getInstance()
1555     {
1556         static AmdFftHelper amdFft;
1557         return amdFft;
1558     }
1559
1560     bool isAvailable() const
1561     {
1562         return g_isAmdFftAvailable;
1563     }
1564
1565     ~AmdFftHelper()
1566     {
1567         try
1568         {
1569 //            clAmdFftTeardown();
1570         }
1571         catch (...) { }
1572     }
1573
1574 protected:
1575     AmdFftHelper()
1576     {
1577         if (!g_isAmdFftInitialized)
1578         {
1579             AutoLock lock(m);
1580
1581             if (!g_isAmdFftInitialized && haveOpenCL())
1582             {
1583                 try
1584                 {
1585                     cl_uint major, minor, patch;
1586                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1587
1588                     // it throws exception in case AmdFft binaries are not found
1589                     CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
1590                     g_isAmdFftAvailable = true;
1591                 }
1592                 catch (const Exception &)
1593                 {
1594                     g_isAmdFftAvailable = false;
1595                 }
1596             }
1597             else
1598                 g_isAmdFftAvailable = false;
1599
1600             g_isAmdFftInitialized = true;
1601         }
1602     }
1603
1604 private:
1605     static clAmdFftSetupData setupData;
1606     static Mutex m;
1607     static bool g_isAmdFftInitialized;
1608     static bool g_isAmdFftAvailable;
1609 };
1610
1611 clAmdFftSetupData AmdFftHelper::setupData;
1612 bool AmdFftHelper::g_isAmdFftAvailable = false;
1613 bool AmdFftHelper::g_isAmdFftInitialized = false;
1614 Mutex AmdFftHelper::m;
1615
1616 bool haveAmdFft()
1617 {
1618     return AmdFftHelper::getInstance().isAvailable();
1619 }
1620
1621 #else
1622
1623 bool haveAmdFft()
1624 {
1625     return false;
1626 }
1627
1628 #endif
1629
1630 void finish()
1631 {
1632     Queue::getDefault().finish();
1633 }
1634
1635 #define IMPLEMENT_REFCOUNTABLE() \
1636     void addref() { CV_XADD(&refcount, 1); } \
1637     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1638     int refcount
1639
1640 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1641
1642 struct Platform::Impl
1643 {
1644     Impl()
1645     {
1646         refcount = 1;
1647         handle = 0;
1648         initialized = false;
1649     }
1650
1651     ~Impl() {}
1652
1653     void init()
1654     {
1655         if( !initialized )
1656         {
1657             //cl_uint num_entries
1658             cl_uint n = 0;
1659             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1660                 handle = 0;
1661             if( handle != 0 )
1662             {
1663                 char buf[1000];
1664                 size_t len = 0;
1665                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1666                 buf[len] = '\0';
1667                 vendor = String(buf);
1668             }
1669
1670             initialized = true;
1671         }
1672     }
1673
1674     IMPLEMENT_REFCOUNTABLE();
1675
1676     cl_platform_id handle;
1677     String vendor;
1678     bool initialized;
1679 };
1680
1681 Platform::Platform()
1682 {
1683     p = 0;
1684 }
1685
1686 Platform::~Platform()
1687 {
1688     if(p)
1689         p->release();
1690 }
1691
1692 Platform::Platform(const Platform& pl)
1693 {
1694     p = (Impl*)pl.p;
1695     if(p)
1696         p->addref();
1697 }
1698
1699 Platform& Platform::operator = (const Platform& pl)
1700 {
1701     Impl* newp = (Impl*)pl.p;
1702     if(newp)
1703         newp->addref();
1704     if(p)
1705         p->release();
1706     p = newp;
1707     return *this;
1708 }
1709
1710 void* Platform::ptr() const
1711 {
1712     return p ? p->handle : 0;
1713 }
1714
1715 Platform& Platform::getDefault()
1716 {
1717     static Platform p;
1718     if( !p.p )
1719     {
1720         p.p = new Impl;
1721         p.p->init();
1722     }
1723     return p;
1724 }
1725
1726 /////////////////////////////////////// Device ////////////////////////////////////////////
1727
1728 // deviceVersion has format
1729 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1730 // by specification
1731 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1732 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1733 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1734 {
1735     major = minor = 0;
1736     if (10 >= deviceVersion.length())
1737         return;
1738     const char *pstr = deviceVersion.c_str();
1739     if (0 != strncmp(pstr, "OpenCL ", 7))
1740         return;
1741     size_t ppos = deviceVersion.find('.', 7);
1742     if (String::npos == ppos)
1743         return;
1744     String temp = deviceVersion.substr(7, ppos - 7);
1745     major = atoi(temp.c_str());
1746     temp = deviceVersion.substr(ppos + 1);
1747     minor = atoi(temp.c_str());
1748 }
1749
1750 struct Device::Impl
1751 {
1752     Impl(void* d)
1753     {
1754         handle = (cl_device_id)d;
1755         refcount = 1;
1756
1757         name_ = getStrProp(CL_DEVICE_NAME);
1758         version_ = getStrProp(CL_DEVICE_VERSION);
1759         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1760         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1761         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1762         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1763         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1764         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1765
1766         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1767         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1768
1769         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1770         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1771             vendorName_ == "AMD")
1772             vendorID_ = VENDOR_AMD;
1773         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
1774             vendorID_ = VENDOR_INTEL;
1775         else if (vendorName_ == "NVIDIA Corporation")
1776             vendorID_ = VENDOR_NVIDIA;
1777         else
1778             vendorID_ = UNKNOWN_VENDOR;
1779     }
1780
1781     template<typename _TpCL, typename _TpOut>
1782     _TpOut getProp(cl_device_info prop) const
1783     {
1784         _TpCL temp=_TpCL();
1785         size_t sz = 0;
1786
1787         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1788             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1789     }
1790
1791     bool getBoolProp(cl_device_info prop) const
1792     {
1793         cl_bool temp = CL_FALSE;
1794         size_t sz = 0;
1795
1796         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1797             sz == sizeof(temp) ? temp != 0 : false;
1798     }
1799
1800     String getStrProp(cl_device_info prop) const
1801     {
1802         char buf[1024];
1803         size_t sz=0;
1804         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1805             sz < sizeof(buf) ? String(buf) : String();
1806     }
1807
1808     IMPLEMENT_REFCOUNTABLE();
1809     cl_device_id handle;
1810
1811     String name_;
1812     String version_;
1813     int doubleFPConfig_;
1814     bool hostUnifiedMemory_;
1815     int maxComputeUnits_;
1816     size_t maxWorkGroupSize_;
1817     int type_;
1818     int deviceVersionMajor_;
1819     int deviceVersionMinor_;
1820     String driverVersion_;
1821     String vendorName_;
1822     int vendorID_;
1823 };
1824
1825
1826 Device::Device()
1827 {
1828     p = 0;
1829 }
1830
1831 Device::Device(void* d)
1832 {
1833     p = 0;
1834     set(d);
1835 }
1836
1837 Device::Device(const Device& d)
1838 {
1839     p = d.p;
1840     if(p)
1841         p->addref();
1842 }
1843
1844 Device& Device::operator = (const Device& d)
1845 {
1846     Impl* newp = (Impl*)d.p;
1847     if(newp)
1848         newp->addref();
1849     if(p)
1850         p->release();
1851     p = newp;
1852     return *this;
1853 }
1854
1855 Device::~Device()
1856 {
1857     if(p)
1858         p->release();
1859 }
1860
1861 void Device::set(void* d)
1862 {
1863     if(p)
1864         p->release();
1865     p = new Impl(d);
1866 }
1867
1868 void* Device::ptr() const
1869 {
1870     return p ? p->handle : 0;
1871 }
1872
1873 String Device::name() const
1874 { return p ? p->name_ : String(); }
1875
1876 String Device::extensions() const
1877 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1878
1879 String Device::version() const
1880 { return p ? p->version_ : String(); }
1881
1882 String Device::vendorName() const
1883 { return p ? p->vendorName_ : String(); }
1884
1885 int Device::vendorID() const
1886 { return p ? p->vendorID_ : 0; }
1887
1888 String Device::OpenCL_C_Version() const
1889 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1890
1891 String Device::OpenCLVersion() const
1892 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1893
1894 int Device::deviceVersionMajor() const
1895 { return p ? p->deviceVersionMajor_ : 0; }
1896
1897 int Device::deviceVersionMinor() const
1898 { return p ? p->deviceVersionMinor_ : 0; }
1899
1900 String Device::driverVersion() const
1901 { return p ? p->driverVersion_ : String(); }
1902
1903 int Device::type() const
1904 { return p ? p->type_ : 0; }
1905
1906 int Device::addressBits() const
1907 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1908
1909 bool Device::available() const
1910 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1911
1912 bool Device::compilerAvailable() const
1913 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1914
1915 bool Device::linkerAvailable() const
1916 #ifdef CL_VERSION_1_2
1917 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1918 #else
1919 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1920 #endif
1921
1922 int Device::doubleFPConfig() const
1923 { return p ? p->doubleFPConfig_ : 0; }
1924
1925 int Device::singleFPConfig() const
1926 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1927
1928 int Device::halfFPConfig() const
1929 #ifdef CL_VERSION_1_2
1930 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1931 #else
1932 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1933 #endif
1934
1935 bool Device::endianLittle() const
1936 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1937
1938 bool Device::errorCorrectionSupport() const
1939 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1940
1941 int Device::executionCapabilities() const
1942 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1943
1944 size_t Device::globalMemCacheSize() const
1945 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1946
1947 int Device::globalMemCacheType() const
1948 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1949
1950 int Device::globalMemCacheLineSize() const
1951 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1952
1953 size_t Device::globalMemSize() const
1954 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1955
1956 size_t Device::localMemSize() const
1957 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1958
1959 int Device::localMemType() const
1960 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1961
1962 bool Device::hostUnifiedMemory() const
1963 { return p ? p->hostUnifiedMemory_ : false; }
1964
1965 bool Device::imageSupport() const
1966 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1967
1968 bool Device::imageFromBufferSupport() const
1969 {
1970     bool ret = false;
1971     if (p)
1972     {
1973         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
1974         if (pos != String::npos)
1975         {
1976             ret = true;
1977         }
1978     }
1979     return ret;
1980 }
1981
1982 uint Device::imagePitchAlignment() const
1983 {
1984 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1985     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1986 #else
1987     return 0;
1988 #endif
1989 }
1990
1991 uint Device::imageBaseAddressAlignment() const
1992 {
1993 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1994     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1995 #else
1996     return 0;
1997 #endif
1998 }
1999
2000 size_t Device::image2DMaxWidth() const
2001 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
2002
2003 size_t Device::image2DMaxHeight() const
2004 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
2005
2006 size_t Device::image3DMaxWidth() const
2007 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
2008
2009 size_t Device::image3DMaxHeight() const
2010 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
2011
2012 size_t Device::image3DMaxDepth() const
2013 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
2014
2015 size_t Device::imageMaxBufferSize() const
2016 #ifdef CL_VERSION_1_2
2017 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
2018 #else
2019 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2020 #endif
2021
2022 size_t Device::imageMaxArraySize() const
2023 #ifdef CL_VERSION_1_2
2024 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
2025 #else
2026 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2027 #endif
2028
2029 int Device::maxClockFrequency() const
2030 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
2031
2032 int Device::maxComputeUnits() const
2033 { return p ? p->maxComputeUnits_ : 0; }
2034
2035 int Device::maxConstantArgs() const
2036 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
2037
2038 size_t Device::maxConstantBufferSize() const
2039 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2040
2041 size_t Device::maxMemAllocSize() const
2042 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2043
2044 size_t Device::maxParameterSize() const
2045 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2046
2047 int Device::maxReadImageArgs() const
2048 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2049
2050 int Device::maxWriteImageArgs() const
2051 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2052
2053 int Device::maxSamplers() const
2054 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2055
2056 size_t Device::maxWorkGroupSize() const
2057 { return p ? p->maxWorkGroupSize_ : 0; }
2058
2059 int Device::maxWorkItemDims() const
2060 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2061
2062 void Device::maxWorkItemSizes(size_t* sizes) const
2063 {
2064     if(p)
2065     {
2066         const int MAX_DIMS = 32;
2067         size_t retsz = 0;
2068         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2069                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2070     }
2071 }
2072
2073 int Device::memBaseAddrAlign() const
2074 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2075
2076 int Device::nativeVectorWidthChar() const
2077 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2078
2079 int Device::nativeVectorWidthShort() const
2080 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2081
2082 int Device::nativeVectorWidthInt() const
2083 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2084
2085 int Device::nativeVectorWidthLong() const
2086 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2087
2088 int Device::nativeVectorWidthFloat() const
2089 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2090
2091 int Device::nativeVectorWidthDouble() const
2092 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2093
2094 int Device::nativeVectorWidthHalf() const
2095 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2096
2097 int Device::preferredVectorWidthChar() const
2098 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2099
2100 int Device::preferredVectorWidthShort() const
2101 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2102
2103 int Device::preferredVectorWidthInt() const
2104 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2105
2106 int Device::preferredVectorWidthLong() const
2107 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2108
2109 int Device::preferredVectorWidthFloat() const
2110 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2111
2112 int Device::preferredVectorWidthDouble() const
2113 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2114
2115 int Device::preferredVectorWidthHalf() const
2116 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2117
2118 size_t Device::printfBufferSize() const
2119 #ifdef CL_VERSION_1_2
2120 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2121 #else
2122 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2123 #endif
2124
2125
2126 size_t Device::profilingTimerResolution() const
2127 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2128
2129 const Device& Device::getDefault()
2130 {
2131     const Context& ctx = Context::getDefault();
2132     int idx = coreTlsData.get()->device;
2133     const Device& device = ctx.device(idx);
2134     return device;
2135 }
2136
2137 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2138
2139 template <typename Functor, typename ObjectType>
2140 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2141 {
2142     ::size_t required;
2143     cl_int err = f(obj, name, 0, NULL, &required);
2144     if (err != CL_SUCCESS)
2145         return err;
2146
2147     param.clear();
2148     if (required > 0)
2149     {
2150         AutoBuffer<char> buf(required + 1);
2151         char* ptr = (char*)buf; // cleanup is not needed
2152         err = f(obj, name, required, ptr, NULL);
2153         if (err != CL_SUCCESS)
2154             return err;
2155         param = ptr;
2156     }
2157
2158     return CL_SUCCESS;
2159 }
2160
2161 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2162 {
2163     elems.clear();
2164     if (s.size() == 0)
2165         return;
2166     std::istringstream ss(s);
2167     std::string item;
2168     while (!ss.eof())
2169     {
2170         std::getline(ss, item, delim);
2171         elems.push_back(item);
2172     }
2173 }
2174
2175 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2176 // Sample: AMD:GPU:
2177 // Sample: AMD:GPU:Tahiti
2178 // Sample: :GPU|CPU: = '' = ':' = '::'
2179 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2180         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2181 {
2182     std::vector<std::string> parts;
2183     split(configurationStr, ':', parts);
2184     if (parts.size() > 3)
2185     {
2186         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2187         return false;
2188     }
2189     if (parts.size() > 2)
2190         deviceNameOrID = parts[2];
2191     if (parts.size() > 1)
2192     {
2193         split(parts[1], '|', deviceTypes);
2194     }
2195     if (parts.size() > 0)
2196     {
2197         platform = parts[0];
2198     }
2199     return true;
2200 }
2201
2202 #ifdef HAVE_WINRT
2203 static cl_device_id selectOpenCLDevice()
2204 {
2205     return NULL;
2206 }
2207 #else
2208 static cl_device_id selectOpenCLDevice()
2209 {
2210     std::string platform, deviceName;
2211     std::vector<std::string> deviceTypes;
2212
2213     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2214     if (configuration &&
2215             (strcmp(configuration, "disabled") == 0 ||
2216              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
2217             ))
2218         return NULL;
2219
2220     bool isID = false;
2221     int deviceID = -1;
2222     if (deviceName.length() == 1)
2223     // We limit ID range to 0..9, because we want to write:
2224     // - '2500' to mean i5-2500
2225     // - '8350' to mean AMD FX-8350
2226     // - '650' to mean GeForce 650
2227     // To extend ID range change condition to '> 0'
2228     {
2229         isID = true;
2230         for (size_t i = 0; i < deviceName.length(); i++)
2231         {
2232             if (!isdigit(deviceName[i]))
2233             {
2234                 isID = false;
2235                 break;
2236             }
2237         }
2238         if (isID)
2239         {
2240             deviceID = atoi(deviceName.c_str());
2241             if (deviceID < 0)
2242                 return NULL;
2243         }
2244     }
2245
2246     std::vector<cl_platform_id> platforms;
2247     {
2248         cl_uint numPlatforms = 0;
2249         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2250
2251         if (numPlatforms == 0)
2252             return NULL;
2253         platforms.resize((size_t)numPlatforms);
2254         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2255         platforms.resize(numPlatforms);
2256     }
2257
2258     int selectedPlatform = -1;
2259     if (platform.length() > 0)
2260     {
2261         for (size_t i = 0; i < platforms.size(); i++)
2262         {
2263             std::string name;
2264             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2265             if (name.find(platform) != std::string::npos)
2266             {
2267                 selectedPlatform = (int)i;
2268                 break;
2269             }
2270         }
2271         if (selectedPlatform == -1)
2272         {
2273             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2274             goto not_found;
2275         }
2276     }
2277     if (deviceTypes.size() == 0)
2278     {
2279         if (!isID)
2280         {
2281             deviceTypes.push_back("GPU");
2282             if (configuration)
2283                 deviceTypes.push_back("CPU");
2284         }
2285         else
2286             deviceTypes.push_back("ALL");
2287     }
2288     for (size_t t = 0; t < deviceTypes.size(); t++)
2289     {
2290         int deviceType = 0;
2291         std::string tempStrDeviceType = deviceTypes[t];
2292         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2293
2294         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2295             deviceType = Device::TYPE_GPU;
2296         else if (tempStrDeviceType == "cpu")
2297             deviceType = Device::TYPE_CPU;
2298         else if (tempStrDeviceType == "accelerator")
2299             deviceType = Device::TYPE_ACCELERATOR;
2300         else if (tempStrDeviceType == "all")
2301             deviceType = Device::TYPE_ALL;
2302         else
2303         {
2304             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2305             goto not_found;
2306         }
2307
2308         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2309         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2310                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2311                 i++)
2312         {
2313             cl_uint count = 0;
2314             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2315             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2316             if (count == 0)
2317                 continue;
2318             size_t base = devices.size();
2319             devices.resize(base + count);
2320             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2321             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2322         }
2323
2324         for (size_t i = (isID ? deviceID : 0);
2325              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2326              i++)
2327         {
2328             std::string name;
2329             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2330             cl_bool useGPU = true;
2331             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2332             {
2333                 cl_bool isIGPU = CL_FALSE;
2334                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2335                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2336             }
2337             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2338             {
2339                 // TODO check for OpenCL 1.1
2340                 return devices[i];
2341             }
2342         }
2343     }
2344
2345 not_found:
2346     if (!configuration)
2347         return NULL; // suppress messages on stderr
2348
2349     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2350             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2351             << "    Device types: ";
2352     for (size_t t = 0; t < deviceTypes.size(); t++)
2353         std::cerr << deviceTypes[t] << " ";
2354
2355     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2356     return NULL;
2357 }
2358 #endif
2359
2360 struct Context::Impl
2361 {
2362     Impl()
2363     {
2364         refcount = 1;
2365         handle = 0;
2366     }
2367
2368     void setDefault()
2369     {
2370         CV_Assert(handle == NULL);
2371
2372         cl_device_id d = selectOpenCLDevice();
2373
2374         if (d == NULL)
2375             return;
2376
2377         cl_platform_id pl = NULL;
2378         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2379
2380         cl_context_properties prop[] =
2381         {
2382             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2383             0
2384         };
2385
2386         // !!! in the current implementation force the number of devices to 1 !!!
2387         cl_uint nd = 1;
2388         cl_int status;
2389
2390         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2391
2392         bool ok = handle != 0 && status == CL_SUCCESS;
2393         if( ok )
2394         {
2395             devices.resize(nd);
2396             devices[0].set(d);
2397         }
2398         else
2399             handle = NULL;
2400     }
2401
2402     Impl(int dtype0)
2403     {
2404         refcount = 1;
2405         handle = 0;
2406
2407         cl_int retval = 0;
2408         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2409         cl_context_properties prop[] =
2410         {
2411             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2412             0
2413         };
2414
2415         cl_uint i, nd0 = 0, nd = 0;
2416         int dtype = dtype0 & 15;
2417         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2418
2419         AutoBuffer<void*> dlistbuf(nd0*2+1);
2420         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2421         cl_device_id* dlist_new = dlist + nd0;
2422         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2423         String name0;
2424
2425         for(i = 0; i < nd0; i++)
2426         {
2427             Device d(dlist[i]);
2428             if( !d.available() || !d.compilerAvailable() )
2429                 continue;
2430             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2431                 continue;
2432             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2433                 continue;
2434             String name = d.name();
2435             if( nd != 0 && name != name0 )
2436                 continue;
2437             name0 = name;
2438             dlist_new[nd++] = dlist[i];
2439         }
2440
2441         if(nd == 0)
2442             return;
2443
2444         // !!! in the current implementation force the number of devices to 1 !!!
2445         nd = 1;
2446
2447         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2448         bool ok = handle != 0 && retval == CL_SUCCESS;
2449         if( ok )
2450         {
2451             devices.resize(nd);
2452             for( i = 0; i < nd; i++ )
2453                 devices[i].set(dlist_new[i]);
2454         }
2455     }
2456
2457     ~Impl()
2458     {
2459         if(handle)
2460         {
2461             clReleaseContext(handle);
2462             handle = NULL;
2463         }
2464         devices.clear();
2465     }
2466
2467     Program getProg(const ProgramSource& src,
2468                     const String& buildflags, String& errmsg)
2469     {
2470         String prefix = Program::getPrefix(buildflags);
2471         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2472         phash_t::iterator it = phash.find(k);
2473         if( it != phash.end() )
2474             return it->second;
2475         //String filename = format("%08x%08x_%08x%08x.clb2",
2476         Program prog(src, buildflags, errmsg);
2477         if(prog.ptr())
2478             phash.insert(std::pair<HashKey,Program>(k, prog));
2479         return prog;
2480     }
2481
2482     IMPLEMENT_REFCOUNTABLE();
2483
2484     cl_context handle;
2485     std::vector<Device> devices;
2486
2487     typedef ProgramSource::hash_t hash_t;
2488
2489     struct HashKey
2490     {
2491         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2492         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2493         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2494         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2495         hash_t a, b;
2496     };
2497     typedef std::map<HashKey, Program> phash_t;
2498     phash_t phash;
2499 };
2500
2501
2502 Context::Context()
2503 {
2504     p = 0;
2505 }
2506
2507 Context::Context(int dtype)
2508 {
2509     p = 0;
2510     create(dtype);
2511 }
2512
2513 bool Context::create()
2514 {
2515     if( !haveOpenCL() )
2516         return false;
2517     if(p)
2518         p->release();
2519     p = new Impl();
2520     if(!p->handle)
2521     {
2522         delete p;
2523         p = 0;
2524     }
2525     return p != 0;
2526 }
2527
2528 bool Context::create(int dtype0)
2529 {
2530     if( !haveOpenCL() )
2531         return false;
2532     if(p)
2533         p->release();
2534     p = new Impl(dtype0);
2535     if(!p->handle)
2536     {
2537         delete p;
2538         p = 0;
2539     }
2540     return p != 0;
2541 }
2542
2543 Context::~Context()
2544 {
2545     if (p)
2546     {
2547         p->release();
2548         p = NULL;
2549     }
2550 }
2551
2552 Context::Context(const Context& c)
2553 {
2554     p = (Impl*)c.p;
2555     if(p)
2556         p->addref();
2557 }
2558
2559 Context& Context::operator = (const Context& c)
2560 {
2561     Impl* newp = (Impl*)c.p;
2562     if(newp)
2563         newp->addref();
2564     if(p)
2565         p->release();
2566     p = newp;
2567     return *this;
2568 }
2569
2570 void* Context::ptr() const
2571 {
2572     return p == NULL ? NULL : p->handle;
2573 }
2574
2575 size_t Context::ndevices() const
2576 {
2577     return p ? p->devices.size() : 0;
2578 }
2579
2580 const Device& Context::device(size_t idx) const
2581 {
2582     static Device dummy;
2583     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2584 }
2585
2586 Context& Context::getDefault(bool initialize)
2587 {
2588     static Context* ctx = new Context();
2589     if(!ctx->p && haveOpenCL())
2590     {
2591         if (!ctx->p)
2592             ctx->p = new Impl();
2593         if (initialize)
2594         {
2595             // do not create new Context right away.
2596             // First, try to retrieve existing context of the same type.
2597             // In its turn, Platform::getContext() may call Context::create()
2598             // if there is no such context.
2599             if (ctx->p->handle == NULL)
2600                 ctx->p->setDefault();
2601         }
2602     }
2603
2604     return *ctx;
2605 }
2606
2607 Program Context::getProg(const ProgramSource& prog,
2608                          const String& buildopts, String& errmsg)
2609 {
2610     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2611 }
2612
2613 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2614 {
2615     cl_context context = (cl_context)_context;
2616     cl_device_id device = (cl_device_id)_device;
2617
2618     // cleanup old context
2619     Context::Impl * impl = ctx.p;
2620     if (impl->handle)
2621     {
2622         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2623     }
2624     impl->devices.clear();
2625
2626     impl->handle = context;
2627     impl->devices.resize(1);
2628     impl->devices[0].set(device);
2629
2630     Platform& p = Platform::getDefault();
2631     Platform::Impl* pImpl = p.p;
2632     pImpl->handle = (cl_platform_id)platform;
2633 }
2634
2635 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2636
2637 struct Queue::Impl
2638 {
2639     Impl(const Context& c, const Device& d)
2640     {
2641         refcount = 1;
2642         const Context* pc = &c;
2643         cl_context ch = (cl_context)pc->ptr();
2644         if( !ch )
2645         {
2646             pc = &Context::getDefault();
2647             ch = (cl_context)pc->ptr();
2648         }
2649         cl_device_id dh = (cl_device_id)d.ptr();
2650         if( !dh )
2651             dh = (cl_device_id)pc->device(0).ptr();
2652         cl_int retval = 0;
2653         handle = clCreateCommandQueue(ch, dh, 0, &retval);
2654         CV_OclDbgAssert(retval == CL_SUCCESS);
2655     }
2656
2657     ~Impl()
2658     {
2659 #ifdef _WIN32
2660         if (!cv::__termination)
2661 #endif
2662         {
2663             if(handle)
2664             {
2665                 clFinish(handle);
2666                 clReleaseCommandQueue(handle);
2667                 handle = NULL;
2668             }
2669         }
2670     }
2671
2672     IMPLEMENT_REFCOUNTABLE();
2673
2674     cl_command_queue handle;
2675 };
2676
2677 Queue::Queue()
2678 {
2679     p = 0;
2680 }
2681
2682 Queue::Queue(const Context& c, const Device& d)
2683 {
2684     p = 0;
2685     create(c, d);
2686 }
2687
2688 Queue::Queue(const Queue& q)
2689 {
2690     p = q.p;
2691     if(p)
2692         p->addref();
2693 }
2694
2695 Queue& Queue::operator = (const Queue& q)
2696 {
2697     Impl* newp = (Impl*)q.p;
2698     if(newp)
2699         newp->addref();
2700     if(p)
2701         p->release();
2702     p = newp;
2703     return *this;
2704 }
2705
2706 Queue::~Queue()
2707 {
2708     if(p)
2709         p->release();
2710 }
2711
2712 bool Queue::create(const Context& c, const Device& d)
2713 {
2714     if(p)
2715         p->release();
2716     p = new Impl(c, d);
2717     return p->handle != 0;
2718 }
2719
2720 void Queue::finish()
2721 {
2722     if(p && p->handle)
2723     {
2724         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
2725     }
2726 }
2727
2728 void* Queue::ptr() const
2729 {
2730     return p ? p->handle : 0;
2731 }
2732
2733 Queue& Queue::getDefault()
2734 {
2735     Queue& q = coreTlsData.get()->oclQueue;
2736     if( !q.p && haveOpenCL() )
2737         q.create(Context::getDefault());
2738     return q;
2739 }
2740
2741 static cl_command_queue getQueue(const Queue& q)
2742 {
2743     cl_command_queue qq = (cl_command_queue)q.ptr();
2744     if(!qq)
2745         qq = (cl_command_queue)Queue::getDefault().ptr();
2746     return qq;
2747 }
2748
2749 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2750
2751 KernelArg::KernelArg()
2752     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2753 {
2754 }
2755
2756 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2757     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2758 {
2759 }
2760
2761 KernelArg KernelArg::Constant(const Mat& m)
2762 {
2763     CV_Assert(m.isContinuous());
2764     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
2765 }
2766
2767 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2768
2769 struct Kernel::Impl
2770 {
2771     Impl(const char* kname, const Program& prog) :
2772         refcount(1), e(0), nu(0)
2773     {
2774         cl_program ph = (cl_program)prog.ptr();
2775         cl_int retval = 0;
2776         handle = ph != 0 ?
2777             clCreateKernel(ph, kname, &retval) : 0;
2778         CV_OclDbgAssert(retval == CL_SUCCESS);
2779         for( int i = 0; i < MAX_ARRS; i++ )
2780             u[i] = 0;
2781         haveTempDstUMats = false;
2782     }
2783
2784     void cleanupUMats()
2785     {
2786         for( int i = 0; i < MAX_ARRS; i++ )
2787             if( u[i] )
2788             {
2789                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2790                     u[i]->currAllocator->deallocate(u[i]);
2791                 u[i] = 0;
2792             }
2793         nu = 0;
2794         haveTempDstUMats = false;
2795     }
2796
2797     void addUMat(const UMat& m, bool dst)
2798     {
2799         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2800         u[nu] = m.u;
2801         CV_XADD(&m.u->urefcount, 1);
2802         nu++;
2803         if(dst && m.u->tempUMat())
2804             haveTempDstUMats = true;
2805     }
2806
2807     void addImage(const Image2D& image)
2808     {
2809         images.push_back(image);
2810     }
2811
2812     void finit()
2813     {
2814         cleanupUMats();
2815         images.clear();
2816         if(e) { clReleaseEvent(e); e = 0; }
2817         release();
2818     }
2819
2820     ~Impl()
2821     {
2822         if(handle)
2823             clReleaseKernel(handle);
2824     }
2825
2826     IMPLEMENT_REFCOUNTABLE();
2827
2828     cl_kernel handle;
2829     cl_event e;
2830     enum { MAX_ARRS = 16 };
2831     UMatData* u[MAX_ARRS];
2832     int nu;
2833     std::list<Image2D> images;
2834     bool haveTempDstUMats;
2835 };
2836
2837 }}
2838
2839 extern "C"
2840 {
2841 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
2842 {
2843     ((cv::ocl::Kernel::Impl*)p)->finit();
2844 }
2845
2846 }
2847
2848 namespace cv { namespace ocl {
2849
2850 Kernel::Kernel()
2851 {
2852     p = 0;
2853 }
2854
2855 Kernel::Kernel(const char* kname, const Program& prog)
2856 {
2857     p = 0;
2858     create(kname, prog);
2859 }
2860
2861 Kernel::Kernel(const char* kname, const ProgramSource& src,
2862                const String& buildopts, String* errmsg)
2863 {
2864     p = 0;
2865     create(kname, src, buildopts, errmsg);
2866 }
2867
2868 Kernel::Kernel(const Kernel& k)
2869 {
2870     p = k.p;
2871     if(p)
2872         p->addref();
2873 }
2874
2875 Kernel& Kernel::operator = (const Kernel& k)
2876 {
2877     Impl* newp = (Impl*)k.p;
2878     if(newp)
2879         newp->addref();
2880     if(p)
2881         p->release();
2882     p = newp;
2883     return *this;
2884 }
2885
2886 Kernel::~Kernel()
2887 {
2888     if(p)
2889         p->release();
2890 }
2891
2892 bool Kernel::create(const char* kname, const Program& prog)
2893 {
2894     if(p)
2895         p->release();
2896     p = new Impl(kname, prog);
2897     if(p->handle == 0)
2898     {
2899         p->release();
2900         p = 0;
2901     }
2902 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
2903     CV_Assert(p);
2904 #endif
2905     return p != 0;
2906 }
2907
2908 bool Kernel::create(const char* kname, const ProgramSource& src,
2909                     const String& buildopts, String* errmsg)
2910 {
2911     if(p)
2912     {
2913         p->release();
2914         p = 0;
2915     }
2916     String tempmsg;
2917     if( !errmsg ) errmsg = &tempmsg;
2918     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2919     return create(kname, prog);
2920 }
2921
2922 void* Kernel::ptr() const
2923 {
2924     return p ? p->handle : 0;
2925 }
2926
2927 bool Kernel::empty() const
2928 {
2929     return ptr() == 0;
2930 }
2931
2932 int Kernel::set(int i, const void* value, size_t sz)
2933 {
2934     if (!p || !p->handle)
2935         return -1;
2936     if (i < 0)
2937         return i;
2938     if( i == 0 )
2939         p->cleanupUMats();
2940
2941     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2942     CV_OclDbgAssert(retval == CL_SUCCESS);
2943     if (retval != CL_SUCCESS)
2944         return -1;
2945     return i+1;
2946 }
2947
2948 int Kernel::set(int i, const Image2D& image2D)
2949 {
2950     p->addImage(image2D);
2951     cl_mem h = (cl_mem)image2D.ptr();
2952     return set(i, &h, sizeof(h));
2953 }
2954
2955 int Kernel::set(int i, const UMat& m)
2956 {
2957     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
2958 }
2959
2960 int Kernel::set(int i, const KernelArg& arg)
2961 {
2962     if( !p || !p->handle )
2963         return -1;
2964     if (i < 0)
2965         return i;
2966     if( i == 0 )
2967         p->cleanupUMats();
2968     if( arg.m )
2969     {
2970         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2971                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2972         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2973         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
2974
2975         if (!h)
2976         {
2977             p->release();
2978             p = 0;
2979             return -1;
2980         }
2981
2982         if (ptronly)
2983             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
2984         else if( arg.m->dims <= 2 )
2985         {
2986             UMat2D u2d(*arg.m);
2987             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2988             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
2989             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
2990             i += 3;
2991
2992             if( !(arg.flags & KernelArg::NO_SIZE) )
2993             {
2994                 int cols = u2d.cols*arg.wscale/arg.iwscale;
2995                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
2996                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
2997                 i += 2;
2998             }
2999         }
3000         else
3001         {
3002             UMat3D u3d(*arg.m);
3003             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
3004             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
3005             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
3006             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
3007             i += 4;
3008             if( !(arg.flags & KernelArg::NO_SIZE) )
3009             {
3010                 int cols = u3d.cols*arg.wscale/arg.iwscale;
3011                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
3012                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
3013                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
3014                 i += 3;
3015             }
3016         }
3017         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
3018         return i;
3019     }
3020     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
3021     return i+1;
3022 }
3023
3024
3025 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
3026                  bool sync, const Queue& q)
3027 {
3028     if(!p || !p->handle || p->e != 0)
3029         return false;
3030
3031     cl_command_queue qq = getQueue(q);
3032     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
3033     size_t total = 1;
3034     CV_Assert(_globalsize != 0);
3035     for (int i = 0; i < dims; i++)
3036     {
3037         size_t val = _localsize ? _localsize[i] :
3038             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
3039         CV_Assert( val > 0 );
3040         total *= _globalsize[i];
3041         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
3042     }
3043     if( total == 0 )
3044         return true;
3045     if( p->haveTempDstUMats )
3046         sync = true;
3047     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3048                                            offset, globalsize, _localsize, 0, 0,
3049                                            sync ? 0 : &p->e);
3050 #if CV_OPENCL_SHOW_RUN_ERRORS
3051     if (retval != CL_SUCCESS)
3052     {
3053         printf("OpenCL program returns error: %d\n", retval);
3054         fflush(stdout);
3055     }
3056 #endif
3057     if( sync || retval != CL_SUCCESS )
3058     {
3059         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3060         p->cleanupUMats();
3061     }
3062     else
3063     {
3064         p->addref();
3065         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3066     }
3067     return retval == CL_SUCCESS;
3068 }
3069
3070 bool Kernel::runTask(bool sync, const Queue& q)
3071 {
3072     if(!p || !p->handle || p->e != 0)
3073         return false;
3074
3075     cl_command_queue qq = getQueue(q);
3076     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3077     if( sync || retval != CL_SUCCESS )
3078     {
3079         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3080         p->cleanupUMats();
3081     }
3082     else
3083     {
3084         p->addref();
3085         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3086     }
3087     return retval == CL_SUCCESS;
3088 }
3089
3090
3091 size_t Kernel::workGroupSize() const
3092 {
3093     if(!p || !p->handle)
3094         return 0;
3095     size_t val = 0, retsz = 0;
3096     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3097     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3098                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3099 }
3100
3101 size_t Kernel::preferedWorkGroupSizeMultiple() const
3102 {
3103     if(!p || !p->handle)
3104         return 0;
3105     size_t val = 0, retsz = 0;
3106     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3107     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3108                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3109 }
3110
3111 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3112 {
3113     if(!p || !p->handle || !wsz)
3114         return 0;
3115     size_t retsz = 0;
3116     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3117     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3118                                     sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
3119 }
3120
3121 size_t Kernel::localMemSize() const
3122 {
3123     if(!p || !p->handle)
3124         return 0;
3125     size_t retsz = 0;
3126     cl_ulong val = 0;
3127     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3128     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3129                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3130 }
3131
3132 /////////////////////////////////////////// Program /////////////////////////////////////////////
3133
3134 struct Program::Impl
3135 {
3136     Impl(const ProgramSource& _src,
3137          const String& _buildflags, String& errmsg)
3138     {
3139         refcount = 1;
3140         const Context& ctx = Context::getDefault();
3141         src = _src;
3142         buildflags = _buildflags;
3143         const String& srcstr = src.source();
3144         const char* srcptr = srcstr.c_str();
3145         size_t srclen = srcstr.size();
3146         cl_int retval = 0;
3147
3148         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3149         if( handle && retval == CL_SUCCESS )
3150         {
3151             int i, n = (int)ctx.ndevices();
3152             AutoBuffer<void*> deviceListBuf(n+1);
3153             void** deviceList = deviceListBuf;
3154             for( i = 0; i < n; i++ )
3155                 deviceList[i] = ctx.device(i).ptr();
3156
3157             Device device = Device::getDefault();
3158             if (device.isAMD())
3159                 buildflags += " -D AMD_DEVICE";
3160             else if (device.isIntel())
3161                 buildflags += " -D INTEL_DEVICE";
3162
3163             retval = clBuildProgram(handle, n,
3164                                     (const cl_device_id*)deviceList,
3165                                     buildflags.c_str(), 0, 0);
3166 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3167             if( retval != CL_SUCCESS )
3168 #endif
3169             {
3170                 size_t retsz = 0;
3171                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3172                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3173                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3174                 {
3175                     AutoBuffer<char> bufbuf(retsz + 16);
3176                     char* buf = bufbuf;
3177                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3178                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3179                     if (buildInfo_retval == CL_SUCCESS)
3180                     {
3181                         // TODO It is useful to see kernel name & program file name also
3182                         errmsg = String(buf);
3183                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3184                         fflush(stdout);
3185                     }
3186                 }
3187                 if (retval != CL_SUCCESS && handle)
3188                 {
3189                     clReleaseProgram(handle);
3190                     handle = NULL;
3191                 }
3192             }
3193         }
3194     }
3195
3196     Impl(const String& _buf, const String& _buildflags)
3197     {
3198         refcount = 1;
3199         handle = 0;
3200         buildflags = _buildflags;
3201         if(_buf.empty())
3202             return;
3203         String prefix0 = Program::getPrefix(buildflags);
3204         const Context& ctx = Context::getDefault();
3205         const Device& dev = Device::getDefault();
3206         const char* pos0 = _buf.c_str();
3207         const char* pos1 = strchr(pos0, '\n');
3208         if(!pos1)
3209             return;
3210         const char* pos2 = strchr(pos1+1, '\n');
3211         if(!pos2)
3212             return;
3213         const char* pos3 = strchr(pos2+1, '\n');
3214         if(!pos3)
3215             return;
3216         size_t prefixlen = (pos3 - pos0)+1;
3217         String prefix(pos0, prefixlen);
3218         if( prefix != prefix0 )
3219             return;
3220         const uchar* bin = (uchar*)(pos3+1);
3221         void* devid = dev.ptr();
3222         size_t codelen = _buf.length() - prefixlen;
3223         cl_int binstatus = 0, retval = 0;
3224         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3225                                            &codelen, &bin, &binstatus, &retval);
3226         CV_OclDbgAssert(retval == CL_SUCCESS);
3227     }
3228
3229     String store()
3230     {
3231         if(!handle)
3232             return String();
3233         size_t progsz = 0, retsz = 0;
3234         String prefix = Program::getPrefix(buildflags);
3235         size_t prefixlen = prefix.length();
3236         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3237             return String();
3238         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3239         uchar* buf = bufbuf;
3240         memcpy(buf, prefix.c_str(), prefixlen);
3241         buf += prefixlen;
3242         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3243             return String();
3244         buf[progsz] = (uchar)'\0';
3245         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3246     }
3247
3248     ~Impl()
3249     {
3250         if( handle )
3251         {
3252 #ifdef _WIN32
3253             if (!cv::__termination)
3254 #endif
3255             {
3256                 clReleaseProgram(handle);
3257             }
3258             handle = NULL;
3259         }
3260     }
3261
3262     IMPLEMENT_REFCOUNTABLE();
3263
3264     ProgramSource src;
3265     String buildflags;
3266     cl_program handle;
3267 };
3268
3269
3270 Program::Program() { p = 0; }
3271
3272 Program::Program(const ProgramSource& src,
3273         const String& buildflags, String& errmsg)
3274 {
3275     p = 0;
3276     create(src, buildflags, errmsg);
3277 }
3278
3279 Program::Program(const Program& prog)
3280 {
3281     p = prog.p;
3282     if(p)
3283         p->addref();
3284 }
3285
3286 Program& Program::operator = (const Program& prog)
3287 {
3288     Impl* newp = (Impl*)prog.p;
3289     if(newp)
3290         newp->addref();
3291     if(p)
3292         p->release();
3293     p = newp;
3294     return *this;
3295 }
3296
3297 Program::~Program()
3298 {
3299     if(p)
3300         p->release();
3301 }
3302
3303 bool Program::create(const ProgramSource& src,
3304             const String& buildflags, String& errmsg)
3305 {
3306     if(p)
3307         p->release();
3308     p = new Impl(src, buildflags, errmsg);
3309     if(!p->handle)
3310     {
3311         p->release();
3312         p = 0;
3313     }
3314     return p != 0;
3315 }
3316
3317 const ProgramSource& Program::source() const
3318 {
3319     static ProgramSource dummy;
3320     return p ? p->src : dummy;
3321 }
3322
3323 void* Program::ptr() const
3324 {
3325     return p ? p->handle : 0;
3326 }
3327
3328 bool Program::read(const String& bin, const String& buildflags)
3329 {
3330     if(p)
3331         p->release();
3332     p = new Impl(bin, buildflags);
3333     return p->handle != 0;
3334 }
3335
3336 bool Program::write(String& bin) const
3337 {
3338     if(!p)
3339         return false;
3340     bin = p->store();
3341     return !bin.empty();
3342 }
3343
3344 String Program::getPrefix() const
3345 {
3346     if(!p)
3347         return String();
3348     return getPrefix(p->buildflags);
3349 }
3350
3351 String Program::getPrefix(const String& buildflags)
3352 {
3353     const Context& ctx = Context::getDefault();
3354     const Device& dev = ctx.device(0);
3355     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3356                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3357 }
3358
3359 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3360
3361 struct ProgramSource::Impl
3362 {
3363     Impl(const char* _src)
3364     {
3365         init(String(_src));
3366     }
3367     Impl(const String& _src)
3368     {
3369         init(_src);
3370     }
3371     void init(const String& _src)
3372     {
3373         refcount = 1;
3374         src = _src;
3375         h = crc64((uchar*)src.c_str(), src.size());
3376     }
3377
3378     IMPLEMENT_REFCOUNTABLE();
3379     String src;
3380     ProgramSource::hash_t h;
3381 };
3382
3383
3384 ProgramSource::ProgramSource()
3385 {
3386     p = 0;
3387 }
3388
3389 ProgramSource::ProgramSource(const char* prog)
3390 {
3391     p = new Impl(prog);
3392 }
3393
3394 ProgramSource::ProgramSource(const String& prog)
3395 {
3396     p = new Impl(prog);
3397 }
3398
3399 ProgramSource::~ProgramSource()
3400 {
3401     if(p)
3402         p->release();
3403 }
3404
3405 ProgramSource::ProgramSource(const ProgramSource& prog)
3406 {
3407     p = prog.p;
3408     if(p)
3409         p->addref();
3410 }
3411
3412 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3413 {
3414     Impl* newp = (Impl*)prog.p;
3415     if(newp)
3416         newp->addref();
3417     if(p)
3418         p->release();
3419     p = newp;
3420     return *this;
3421 }
3422
3423 const String& ProgramSource::source() const
3424 {
3425     static String dummy;
3426     return p ? p->src : dummy;
3427 }
3428
3429 ProgramSource::hash_t ProgramSource::hash() const
3430 {
3431     return p ? p->h : 0;
3432 }
3433
3434 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3435
3436 class OpenCLBufferPool
3437 {
3438 protected:
3439     ~OpenCLBufferPool() { }
3440 public:
3441     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
3442     virtual void release(cl_mem handle, size_t capacity) = 0;
3443 };
3444
3445 class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
3446 {
3447 public:
3448     struct BufferEntry
3449     {
3450         cl_mem clBuffer_;
3451         size_t capacity_;
3452     };
3453 protected:
3454     Mutex mutex_;
3455
3456     size_t currentReservedSize;
3457     size_t maxReservedSize;
3458
3459     std::list<BufferEntry> reservedEntries_; // LRU order
3460
3461     // synchronized
3462     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3463     {
3464         if (reservedEntries_.empty())
3465             return false;
3466         std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3467         std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3468         BufferEntry result = {NULL, 0};
3469         size_t minDiff = (size_t)(-1);
3470         for (; i != reservedEntries_.end(); ++i)
3471         {
3472             BufferEntry& e = *i;
3473             if (e.capacity_ >= size)
3474             {
3475                 size_t diff = e.capacity_ - size;
3476                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3477                 {
3478                     minDiff = diff;
3479                     result_pos = i;
3480                     result = e;
3481                     if (diff == 0)
3482                         break;
3483                 }
3484             }
3485         }
3486         if (result_pos != reservedEntries_.end())
3487         {
3488             //CV_DbgAssert(result == *result_pos);
3489             reservedEntries_.erase(result_pos);
3490             entry = result;
3491             currentReservedSize -= entry.capacity_;
3492             return true;
3493         }
3494         return false;
3495     }
3496
3497     // synchronized
3498     void _checkSizeOfReservedEntries()
3499     {
3500         while (currentReservedSize > maxReservedSize)
3501         {
3502             CV_DbgAssert(!reservedEntries_.empty());
3503             const BufferEntry& entry = reservedEntries_.back();
3504             CV_DbgAssert(currentReservedSize >= entry.capacity_);
3505             currentReservedSize -= entry.capacity_;
3506             _releaseBufferEntry(entry);
3507             reservedEntries_.pop_back();
3508         }
3509     }
3510
3511     inline size_t _allocationGranularity(size_t size)
3512     {
3513         // heuristic values
3514         if (size < 1024)
3515             return 16;
3516         else if (size < 64*1024)
3517             return 64;
3518         else if (size < 1024*1024)
3519             return 4096;
3520         else if (size < 16*1024*1024)
3521             return 64*1024;
3522         else
3523             return 1024*1024;
3524     }
3525
3526     void _allocateBufferEntry(BufferEntry& entry, size_t size)
3527     {
3528         CV_DbgAssert(entry.clBuffer_ == NULL);
3529         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3530         Context& ctx = Context::getDefault();
3531         cl_int retval = CL_SUCCESS;
3532         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
3533         CV_Assert(retval == CL_SUCCESS);
3534         CV_Assert(entry.clBuffer_ != NULL);
3535         if(retval == CL_SUCCESS)
3536         {
3537             CV_IMPL_ADD(CV_IMPL_OCL);
3538         }
3539         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3540                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3541     }
3542
3543     void _releaseBufferEntry(const BufferEntry& entry)
3544     {
3545         CV_Assert(entry.capacity_ != 0);
3546         CV_Assert(entry.clBuffer_ != NULL);
3547         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
3548                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
3549         clReleaseMemObject(entry.clBuffer_);
3550     }
3551 public:
3552     OpenCLBufferPoolImpl()
3553         : currentReservedSize(0), maxReservedSize(0)
3554     {
3555         int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
3556         maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
3557     }
3558     virtual ~OpenCLBufferPoolImpl()
3559     {
3560         freeAllReservedBuffers();
3561         CV_Assert(reservedEntries_.empty());
3562     }
3563 public:
3564     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
3565     {
3566         BufferEntry entry = {NULL, 0};
3567         if (maxReservedSize > 0)
3568         {
3569             AutoLock locker(mutex_);
3570             if (_findAndRemoveEntryFromReservedList(entry, size))
3571             {
3572                 CV_DbgAssert(size <= entry.capacity_);
3573                 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3574                 capacity = entry.capacity_;
3575                 return entry.clBuffer_;
3576             }
3577         }
3578         _allocateBufferEntry(entry, size);
3579         capacity = entry.capacity_;
3580         return entry.clBuffer_;
3581     }
3582     virtual void release(cl_mem handle, size_t capacity)
3583     {
3584         BufferEntry entry = {handle, capacity};
3585         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3586         {
3587             _releaseBufferEntry(entry);
3588         }
3589         else
3590         {
3591             AutoLock locker(mutex_);
3592             reservedEntries_.push_front(entry);
3593             currentReservedSize += entry.capacity_;
3594             _checkSizeOfReservedEntries();
3595         }
3596     }
3597
3598     virtual size_t getReservedSize() const { return currentReservedSize; }
3599     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3600     virtual void setMaxReservedSize(size_t size)
3601     {
3602         AutoLock locker(mutex_);
3603         size_t oldMaxReservedSize = maxReservedSize;
3604         maxReservedSize = size;
3605         if (maxReservedSize < oldMaxReservedSize)
3606         {
3607             std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3608             for (; i != reservedEntries_.end();)
3609             {
3610                 const BufferEntry& entry = *i;
3611                 if (entry.capacity_ > maxReservedSize / 8)
3612                 {
3613                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
3614                     currentReservedSize -= entry.capacity_;
3615                     _releaseBufferEntry(entry);
3616                     i = reservedEntries_.erase(i);
3617                     continue;
3618                 }
3619                 ++i;
3620             }
3621             _checkSizeOfReservedEntries();
3622         }
3623     }
3624     virtual void freeAllReservedBuffers()
3625     {
3626         AutoLock locker(mutex_);
3627         std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3628         for (; i != reservedEntries_.end(); ++i)
3629         {
3630             const BufferEntry& entry = *i;
3631             _releaseBufferEntry(entry);
3632         }
3633         reservedEntries_.clear();
3634     }
3635 };
3636
3637 #if defined _MSC_VER
3638 #pragma warning(disable:4127) // conditional expression is constant
3639 #endif
3640 template <bool readAccess, bool writeAccess>
3641 class AlignedDataPtr
3642 {
3643 protected:
3644     const size_t size_;
3645     uchar* const originPtr_;
3646     const size_t alignment_;
3647     uchar* ptr_;
3648     uchar* allocatedPtr_;
3649
3650 public:
3651     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
3652         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
3653     {
3654         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
3655         if (((size_t)ptr_ & (alignment - 1)) != 0)
3656         {
3657             allocatedPtr_ = new uchar[size_ + alignment - 1];
3658             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
3659             if (readAccess)
3660             {
3661                 memcpy(ptr_, originPtr_, size_);
3662             }
3663         }
3664     }
3665
3666     uchar* getAlignedPtr() const
3667     {
3668         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
3669         return ptr_;
3670     }
3671
3672     ~AlignedDataPtr()
3673     {
3674         if (allocatedPtr_)
3675         {
3676             if (writeAccess)
3677             {
3678                 memcpy(originPtr_, ptr_, size_);
3679             }
3680             delete[] allocatedPtr_;
3681             allocatedPtr_ = NULL;
3682         }
3683         ptr_ = NULL;
3684     }
3685 private:
3686     AlignedDataPtr(const AlignedDataPtr&); // disabled
3687     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
3688 };
3689 #if defined _MSC_VER
3690 #pragma warning(default:4127) // conditional expression is constant
3691 #endif
3692
3693 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
3694 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
3695 #endif
3696
3697 class OpenCLAllocator : public MatAllocator
3698 {
3699     mutable OpenCLBufferPoolImpl bufferPool;
3700     enum AllocatorFlags
3701     {
3702         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
3703     };
3704 public:
3705     OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
3706
3707     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
3708             int flags, UMatUsageFlags usageFlags) const
3709     {
3710         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
3711         return u;
3712     }
3713
3714     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
3715     {
3716         const Device& dev = ctx.device(0);
3717         createFlags = 0;
3718         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
3719             createFlags |= CL_MEM_ALLOC_HOST_PTR;
3720
3721         if( dev.hostUnifiedMemory() )
3722             flags0 = 0;
3723         else
3724             flags0 = UMatData::COPY_ON_MAP;
3725     }
3726
3727     UMatData* allocate(int dims, const int* sizes, int type,
3728                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
3729     {
3730         if(!useOpenCL())
3731             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3732         CV_Assert(data == 0);
3733         size_t total = CV_ELEM_SIZE(type);
3734         for( int i = dims-1; i >= 0; i-- )
3735         {
3736             if( step )
3737                 step[i] = total;
3738             total *= sizes[i];
3739         }
3740
3741         Context& ctx = Context::getDefault();
3742         int createFlags = 0, flags0 = 0;
3743         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
3744
3745         size_t capacity = 0;
3746         void* handle = NULL;
3747         int allocatorFlags = 0;
3748         if (createFlags == 0)
3749         {
3750             handle = bufferPool.allocate(total, capacity);
3751             if (!handle)
3752                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3753             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
3754         }
3755         else
3756         {
3757             capacity = total;
3758             cl_int retval = 0;
3759             handle = clCreateBuffer((cl_context)ctx.ptr(),
3760                                           CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
3761             if( !handle || retval != CL_SUCCESS )
3762                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3763             CV_IMPL_ADD(CV_IMPL_OCL)
3764         }
3765         UMatData* u = new UMatData(this);
3766         u->data = 0;
3767         u->size = total;
3768         u->capacity = capacity;
3769         u->handle = handle;
3770         u->flags = flags0;
3771         u->allocatorFlags_ = allocatorFlags;
3772         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
3773         return u;
3774     }
3775
3776     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
3777     {
3778         if(!u)
3779             return false;
3780
3781         UMatDataAutoLock lock(u);
3782
3783         if(u->handle == 0)
3784         {
3785             CV_Assert(u->origdata != 0);
3786             Context& ctx = Context::getDefault();
3787             int createFlags = 0, flags0 = 0;
3788             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
3789
3790             cl_context ctx_handle = (cl_context)ctx.ptr();
3791             cl_int retval = 0;
3792             int tempUMatFlags = UMatData::TEMP_UMAT;
3793             u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
3794                                        u->size, u->origdata, &retval);
3795             if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
3796             {
3797                 u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
3798                                            u->size, u->origdata, &retval);
3799                 tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
3800
3801             }
3802             if(!u->handle || retval != CL_SUCCESS)
3803                 return false;
3804             u->prevAllocator = u->currAllocator;
3805             u->currAllocator = this;
3806             u->flags |= tempUMatFlags;
3807         }
3808         if(accessFlags & ACCESS_WRITE)
3809             u->markHostCopyObsolete(true);
3810         return true;
3811     }
3812
3813     /*void sync(UMatData* u) const
3814     {
3815         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3816         UMatDataAutoLock lock(u);
3817
3818         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
3819         {
3820             if( u->tempCopiedUMat() )
3821             {
3822                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3823                                     u->size, u->origdata, 0, 0, 0);
3824             }
3825             else
3826             {
3827                 cl_int retval = 0;
3828                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3829                                                 (CL_MAP_READ | CL_MAP_WRITE),
3830                                                 0, u->size, 0, 0, 0, &retval);
3831                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
3832                 clFinish(q);
3833             }
3834             u->markHostCopyObsolete(false);
3835         }
3836         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
3837         {
3838             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3839                                  u->size, u->data, 0, 0, 0);
3840         }
3841     }*/
3842
3843     void deallocate(UMatData* u) const
3844     {
3845         if(!u)
3846             return;
3847
3848         CV_Assert(u->urefcount >= 0);
3849         CV_Assert(u->refcount >= 0);
3850
3851         // TODO: !!! when we add Shared Virtual Memory Support,
3852         // this function (as well as the others) should be corrected
3853         CV_Assert(u->handle != 0 && u->urefcount == 0);
3854         if(u->tempUMat())
3855         {
3856 //            UMatDataAutoLock lock(u);
3857             if( u->hostCopyObsolete() && u->refcount > 0 )
3858             {
3859                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3860                 if( u->tempCopiedUMat() )
3861                 {
3862                     AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3863                     CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3864                                         u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
3865                 }
3866                 else
3867                 {
3868                     cl_int retval = 0;
3869                     void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3870                                                     (CL_MAP_READ | CL_MAP_WRITE),
3871                                                     0, u->size, 0, 0, 0, &retval);
3872                     CV_OclDbgAssert(retval == CL_SUCCESS);
3873                     CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
3874                     CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3875                 }
3876             }
3877             u->markHostCopyObsolete(false);
3878             clReleaseMemObject((cl_mem)u->handle);
3879             u->handle = 0;
3880             u->currAllocator = u->prevAllocator;
3881             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3882                 fastFree(u->data);
3883             u->data = u->origdata;
3884             if(u->refcount == 0)
3885                 u->currAllocator->deallocate(u);
3886         }
3887         else
3888         {
3889             CV_Assert(u->refcount == 0);
3890             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3891             {
3892                 fastFree(u->data);
3893                 u->data = 0;
3894             }
3895             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
3896             {
3897                 bufferPool.release((cl_mem)u->handle, u->capacity);
3898             }
3899             else
3900             {
3901                 clReleaseMemObject((cl_mem)u->handle);
3902             }
3903             u->handle = 0;
3904             u->capacity = 0;
3905             delete u;
3906         }
3907     }
3908
3909     void map(UMatData* u, int accessFlags) const
3910     {
3911         if(!u)
3912             return;
3913
3914         CV_Assert( u->handle != 0 );
3915
3916         UMatDataAutoLock autolock(u);
3917
3918         if(accessFlags & ACCESS_WRITE)
3919             u->markDeviceCopyObsolete(true);
3920
3921         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3922
3923         // FIXIT Workaround for UMat synchronization issue
3924         // if( u->refcount == 0 )
3925         {
3926             if( !u->copyOnMap() )
3927             {
3928                 if (u->data) // FIXIT Workaround for UMat synchronization issue
3929                 {
3930                     //CV_Assert(u->hostCopyObsolete() == false);
3931                     return;
3932                 }
3933                 // because there can be other map requests for the same UMat with different access flags,
3934                 // we use the universal (read-write) access mode.
3935                 cl_int retval = 0;
3936                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3937                                                      (CL_MAP_READ | CL_MAP_WRITE),
3938                                                      0, u->size, 0, 0, 0, &retval);
3939                 if(u->data && retval == CL_SUCCESS)
3940                 {
3941                     u->markHostCopyObsolete(false);
3942                     u->markDeviceMemMapped(true);
3943                     return;
3944                 }
3945
3946                 // if map failed, switch to copy-on-map mode for the particular buffer
3947                 u->flags |= UMatData::COPY_ON_MAP;
3948             }
3949
3950             if(!u->data)
3951             {
3952                 u->data = (uchar*)fastMalloc(u->size);
3953                 u->markHostCopyObsolete(true);
3954             }
3955         }
3956
3957         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
3958         {
3959             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3960             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3961                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
3962             u->markHostCopyObsolete(false);
3963         }
3964     }
3965
3966     void unmap(UMatData* u) const
3967     {
3968         if(!u)
3969             return;
3970
3971
3972         CV_Assert(u->handle != 0);
3973
3974         UMatDataAutoLock autolock(u);
3975
3976         // FIXIT Workaround for UMat synchronization issue
3977         if(u->refcount > 0)
3978             return;
3979
3980         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3981         cl_int retval = 0;
3982         if( !u->copyOnMap() && u->deviceMemMapped() )
3983         {
3984             CV_Assert(u->data != NULL);
3985             u->markDeviceMemMapped(false);
3986             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
3987                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
3988             if (Device::getDefault().isAMD())
3989             {
3990                 // required for multithreaded applications (see stitching test)
3991                 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3992             }
3993             u->data = 0;
3994         }
3995         else if( u->copyOnMap() && u->deviceCopyObsolete() )
3996         {
3997             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3998             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3999                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
4000         }
4001         u->markDeviceCopyObsolete(false);
4002         u->markHostCopyObsolete(false);
4003     }
4004
4005     bool checkContinuous(int dims, const size_t sz[],
4006                          const size_t srcofs[], const size_t srcstep[],
4007                          const size_t dstofs[], const size_t dststep[],
4008                          size_t& total, size_t new_sz[],
4009                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
4010                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
4011     {
4012         bool iscontinuous = true;
4013         srcrawofs = srcofs ? srcofs[dims-1] : 0;
4014         dstrawofs = dstofs ? dstofs[dims-1] : 0;
4015         total = sz[dims-1];
4016         for( int i = dims-2; i >= 0; i-- )
4017         {
4018             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
4019                 iscontinuous = false;
4020             total *= sz[i];
4021             if( srcofs )
4022                 srcrawofs += srcofs[i]*srcstep[i];
4023             if( dstofs )
4024                 dstrawofs += dstofs[i]*dststep[i];
4025         }
4026
4027         if( !iscontinuous )
4028         {
4029             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
4030             if( dims == 2 )
4031             {
4032                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
4033                 // we assume that new_... arrays are initialized by caller
4034                 // with 0's, so there is no else branch
4035                 if( srcofs )
4036                 {
4037                     new_srcofs[0] = srcofs[1];
4038                     new_srcofs[1] = srcofs[0];
4039                     new_srcofs[2] = 0;
4040                 }
4041
4042                 if( dstofs )
4043                 {
4044                     new_dstofs[0] = dstofs[1];
4045                     new_dstofs[1] = dstofs[0];
4046                     new_dstofs[2] = 0;
4047                 }
4048
4049                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
4050                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
4051             }
4052             else
4053             {
4054                 // we could check for dims == 3 here,
4055                 // but from user perspective this one is more informative
4056                 CV_Assert(dims <= 3);
4057                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
4058                 if( srcofs )
4059                 {
4060                     new_srcofs[0] = srcofs[2];
4061                     new_srcofs[1] = srcofs[1];
4062                     new_srcofs[2] = srcofs[0];
4063                 }
4064
4065                 if( dstofs )
4066                 {
4067                     new_dstofs[0] = dstofs[2];
4068                     new_dstofs[1] = dstofs[1];
4069                     new_dstofs[2] = dstofs[0];
4070                 }
4071
4072                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4073                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4074             }
4075         }
4076         return iscontinuous;
4077     }
4078
4079     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4080                   const size_t srcofs[], const size_t srcstep[],
4081                   const size_t dststep[]) const
4082     {
4083         if(!u)
4084             return;
4085         UMatDataAutoLock autolock(u);
4086
4087         if( u->data && !u->hostCopyObsolete() )
4088         {
4089             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4090             return;
4091         }
4092         CV_Assert( u->handle != 0 );
4093
4094         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4095
4096         size_t total = 0, new_sz[] = {0, 0, 0};
4097         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4098         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4099
4100         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4101                                             total, new_sz,
4102                                             srcrawofs, new_srcofs, new_srcstep,
4103                                             dstrawofs, new_dstofs, new_dststep);
4104
4105         AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4106         if( iscontinuous )
4107         {
4108             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4109                                            srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4110         }
4111         else
4112         {
4113             CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4114                             new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4115                             new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4116         }
4117     }
4118
4119     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4120                 const size_t dstofs[], const size_t dststep[],
4121                 const size_t srcstep[]) const
4122     {
4123         if(!u)
4124             return;
4125
4126         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4127         CV_Assert(u->refcount == 0 || u->tempUMat());
4128
4129         size_t total = 0, new_sz[] = {0, 0, 0};
4130         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4131         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4132
4133         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4134                                             total, new_sz,
4135                                             srcrawofs, new_srcofs, new_srcstep,
4136                                             dstrawofs, new_dstofs, new_dststep);
4137
4138         UMatDataAutoLock autolock(u);
4139
4140         // if there is cached CPU copy of the GPU matrix,
4141         // we could use it as a destination.
4142         // we can do it in 2 cases:
4143         //    1. we overwrite the whole content
4144         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4145         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4146         {
4147             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4148             u->markHostCopyObsolete(false);
4149             u->markDeviceCopyObsolete(true);
4150             return;
4151         }
4152
4153         CV_Assert( u->handle != 0 );
4154         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4155
4156         AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4157         if( iscontinuous )
4158         {
4159             CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4160                 CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
4161         }
4162         else
4163         {
4164             CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4165                 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4166                 new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
4167         }
4168
4169         u->markHostCopyObsolete(true);
4170         u->markDeviceCopyObsolete(false);
4171     }
4172
4173     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4174               const size_t srcofs[], const size_t srcstep[],
4175               const size_t dstofs[], const size_t dststep[], bool _sync) const
4176     {
4177         if(!src || !dst)
4178             return;
4179
4180         size_t total = 0, new_sz[] = {0, 0, 0};
4181         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4182         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4183
4184         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4185                                             total, new_sz,
4186                                             srcrawofs, new_srcofs, new_srcstep,
4187                                             dstrawofs, new_dstofs, new_dststep);
4188
4189         UMatDataAutoLock src_autolock(src);
4190         UMatDataAutoLock dst_autolock(dst);
4191
4192         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
4193         {
4194             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
4195             return;
4196         }
4197         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
4198         {
4199             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
4200             dst->markHostCopyObsolete(false);
4201             dst->markDeviceCopyObsolete(true);
4202             return;
4203         }
4204
4205         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4206         CV_Assert(dst->refcount == 0);
4207         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4208
4209         cl_int retval;
4210         if( iscontinuous )
4211         {
4212             CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4213                                            srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
4214         }
4215         else
4216         {
4217             CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4218                                                new_srcofs, new_dstofs, new_sz,
4219                                                new_srcstep[0], new_srcstep[1],
4220                                                new_dststep[0], new_dststep[1],
4221                                                0, 0, 0)) == CL_SUCCESS );
4222         }
4223         if(retval == CL_SUCCESS)
4224         {
4225             CV_IMPL_ADD(CV_IMPL_OCL)
4226         }
4227
4228         dst->markHostCopyObsolete(true);
4229         dst->markDeviceCopyObsolete(false);
4230
4231         if( _sync )
4232         {
4233             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4234         }
4235     }
4236
4237     BufferPoolController* getBufferPoolController() const { return &bufferPool; }
4238
4239     MatAllocator* matStdAllocator;
4240 };
4241
4242 MatAllocator* getOpenCLAllocator()
4243 {
4244     static MatAllocator * allocator = new OpenCLAllocator();
4245     return allocator;
4246 }
4247
4248 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
4249
4250 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
4251 {
4252     cl_uint numDevices = 0;
4253     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4254                                 0, NULL, &numDevices) == CL_SUCCESS);
4255
4256     if (numDevices == 0)
4257     {
4258         devices.clear();
4259         return;
4260     }
4261
4262     devices.resize((size_t)numDevices);
4263     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4264                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
4265 }
4266
4267 struct PlatformInfo::Impl
4268 {
4269     Impl(void* id)
4270     {
4271         refcount = 1;
4272         handle = *(cl_platform_id*)id;
4273         getDevices(devices, handle);
4274     }
4275
4276     String getStrProp(cl_device_info prop) const
4277     {
4278         char buf[1024];
4279         size_t sz=0;
4280         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
4281             sz < sizeof(buf) ? String(buf) : String();
4282     }
4283
4284     IMPLEMENT_REFCOUNTABLE();
4285     std::vector<cl_device_id> devices;
4286     cl_platform_id handle;
4287 };
4288
4289 PlatformInfo::PlatformInfo()
4290 {
4291     p = 0;
4292 }
4293
4294 PlatformInfo::PlatformInfo(void* platform_id)
4295 {
4296     p = new Impl(platform_id);
4297 }
4298
4299 PlatformInfo::~PlatformInfo()
4300 {
4301     if(p)
4302         p->release();
4303 }
4304
4305 PlatformInfo::PlatformInfo(const PlatformInfo& i)
4306 {
4307     if (i.p)
4308         i.p->addref();
4309     p = i.p;
4310 }
4311
4312 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
4313 {
4314     if (i.p != p)
4315     {
4316         if (i.p)
4317             i.p->addref();
4318         if (p)
4319             p->release();
4320         p = i.p;
4321     }
4322     return *this;
4323 }
4324
4325 int PlatformInfo::deviceNumber() const
4326 {
4327     return p ? (int)p->devices.size() : 0;
4328 }
4329
4330 void PlatformInfo::getDevice(Device& device, int d) const
4331 {
4332     CV_Assert(p && d < (int)p->devices.size() );
4333     if(p)
4334         device.set(p->devices[d]);
4335 }
4336
4337 String PlatformInfo::name() const
4338 {
4339     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
4340 }
4341
4342 String PlatformInfo::vendor() const
4343 {
4344     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
4345 }
4346
4347 String PlatformInfo::version() const
4348 {
4349     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
4350 }
4351
4352 static void getPlatforms(std::vector<cl_platform_id>& platforms)
4353 {
4354     cl_uint numPlatforms = 0;
4355     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
4356
4357     if (numPlatforms == 0)
4358     {
4359         platforms.clear();
4360         return;
4361     }
4362
4363     platforms.resize((size_t)numPlatforms);
4364     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
4365 }
4366
4367 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
4368 {
4369     std::vector<cl_platform_id> platforms;
4370     getPlatforms(platforms);
4371
4372     for (size_t i = 0; i < platforms.size(); i++)
4373         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
4374 }
4375
4376 const char* typeToStr(int type)
4377 {
4378     static const char* tab[]=
4379     {
4380         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4381         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4382         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4383         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4384         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4385         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
4386         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
4387         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4388     };
4389     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4390     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4391 }
4392
4393 const char* memopTypeToStr(int type)
4394 {
4395     static const char* tab[] =
4396     {
4397         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4398         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4399         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4400         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4401         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4402         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4403         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
4404         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4405     };
4406     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4407     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4408 }
4409
4410 const char* vecopTypeToStr(int type)
4411 {
4412     static const char* tab[] =
4413     {
4414         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
4415         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
4416         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
4417         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
4418         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4419         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4420         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
4421         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4422     };
4423     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4424     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4425 }
4426
4427 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
4428 {
4429     if( sdepth == ddepth )
4430         return "noconvert";
4431     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
4432     if( ddepth >= CV_32F ||
4433         (ddepth == CV_32S && sdepth < CV_32S) ||
4434         (ddepth == CV_16S && sdepth <= CV_8S) ||
4435         (ddepth == CV_16U && sdepth == CV_8U))
4436     {
4437         sprintf(buf, "convert_%s", typestr);
4438     }
4439     else if( sdepth >= CV_32F )
4440         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
4441     else
4442         sprintf(buf, "convert_%s_sat", typestr);
4443
4444     return buf;
4445 }
4446
4447 template <typename T>
4448 static std::string kerToStr(const Mat & k)
4449 {
4450     int width = k.cols - 1, depth = k.depth();
4451     const T * const data = k.ptr<T>();
4452
4453     std::ostringstream stream;
4454     stream.precision(10);
4455
4456     if (depth <= CV_8S)
4457     {
4458         for (int i = 0; i < width; ++i)
4459             stream << "DIG(" << (int)data[i] << ")";
4460         stream << "DIG(" << (int)data[width] << ")";
4461     }
4462     else if (depth == CV_32F)
4463     {
4464         stream.setf(std::ios_base::showpoint);
4465         for (int i = 0; i < width; ++i)
4466             stream << "DIG(" << data[i] << "f)";
4467         stream << "DIG(" << data[width] << "f)";
4468     }
4469     else
4470     {
4471         for (int i = 0; i < width; ++i)
4472             stream << "DIG(" << data[i] << ")";
4473         stream << "DIG(" << data[width] << ")";
4474     }
4475
4476     return stream.str();
4477 }
4478
4479 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
4480 {
4481     Mat kernel = _kernel.getMat().reshape(1, 1);
4482
4483     int depth = kernel.depth();
4484     if (ddepth < 0)
4485         ddepth = depth;
4486
4487     if (ddepth != depth)
4488         kernel.convertTo(kernel, ddepth);
4489
4490     typedef std::string (* func_t)(const Mat &);
4491     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
4492                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
4493     const func_t func = funcs[ddepth];
4494     CV_Assert(func != 0);
4495
4496     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
4497 }
4498
4499 #define PROCESS_SRC(src) \
4500     do \
4501     { \
4502         if (!src.empty()) \
4503         { \
4504             CV_Assert(src.isMat() || src.isUMat()); \
4505             Size csize = src.size(); \
4506             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
4507                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
4508             if (cwidth < ckercn || ckercn <= 0) \
4509                 return 1; \
4510             cols.push_back(cwidth); \
4511             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
4512                 return 1; \
4513             offsets.push_back(src.offset()); \
4514             steps.push_back(src.step()); \
4515             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
4516             kercns.push_back(ckercn); \
4517         } \
4518     } \
4519     while ((void)0, 0)
4520
4521 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
4522                               InputArray src4, InputArray src5, InputArray src6,
4523                               InputArray src7, InputArray src8, InputArray src9,
4524                               OclVectorStrategy strat)
4525 {
4526     const ocl::Device & d = ocl::Device::getDefault();
4527
4528     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
4529         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
4530         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
4531         d.preferredVectorWidthDouble(), -1 };
4532
4533     // if the device says don't use vectors
4534     if (vectorWidths[0] == 1)
4535     {
4536         // it's heuristic
4537         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
4538         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
4539         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
4540     }
4541
4542     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
4543 }
4544
4545 int checkOptimalVectorWidth(const int *vectorWidths,
4546                             InputArray src1, InputArray src2, InputArray src3,
4547                             InputArray src4, InputArray src5, InputArray src6,
4548                             InputArray src7, InputArray src8, InputArray src9,
4549                             OclVectorStrategy strat)
4550 {
4551     CV_Assert(vectorWidths);
4552
4553     int ref_type = src1.type();
4554
4555     std::vector<size_t> offsets, steps, cols;
4556     std::vector<int> dividers, kercns;
4557     PROCESS_SRC(src1);
4558     PROCESS_SRC(src2);
4559     PROCESS_SRC(src3);
4560     PROCESS_SRC(src4);
4561     PROCESS_SRC(src5);
4562     PROCESS_SRC(src6);
4563     PROCESS_SRC(src7);
4564     PROCESS_SRC(src8);
4565     PROCESS_SRC(src9);
4566
4567     size_t size = offsets.size();
4568
4569     for (size_t i = 0; i < size; ++i)
4570         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
4571             dividers[i] >>= 1, kercns[i] >>= 1;
4572
4573     // default strategy
4574     int kercn = *std::min_element(kercns.begin(), kercns.end());
4575
4576     return kercn;
4577 }
4578
4579 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
4580                                  InputArray src4, InputArray src5, InputArray src6,
4581                                  InputArray src7, InputArray src8, InputArray src9)
4582 {
4583     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
4584 }
4585
4586 #undef PROCESS_SRC
4587
4588
4589 // TODO Make this as a method of OpenCL "BuildOptions" class
4590 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
4591 {
4592     if (!buildOptions.empty())
4593         buildOptions += " ";
4594     int type = _m.type(), depth = CV_MAT_DEPTH(type);
4595     buildOptions += format(
4596             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
4597             name.c_str(), ocl::typeToStr(type),
4598             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
4599             name.c_str(), (int)CV_MAT_CN(type),
4600             name.c_str(), (int)CV_ELEM_SIZE(type),
4601             name.c_str(), (int)CV_ELEM_SIZE1(type),
4602             name.c_str(), (int)depth
4603             );
4604 }
4605
4606
4607 struct Image2D::Impl
4608 {
4609     Impl(const UMat &src, bool norm, bool alias)
4610     {
4611         handle = 0;
4612         refcount = 1;
4613         init(src, norm, alias);
4614     }
4615
4616     ~Impl()
4617     {
4618         if (handle)
4619             clReleaseMemObject(handle);
4620     }
4621
4622     static cl_image_format getImageFormat(int depth, int cn, bool norm)
4623     {
4624         cl_image_format format;
4625         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
4626                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
4627         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
4628                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
4629         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
4630
4631         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
4632         int channelOrder = channelOrders[cn];
4633         format.image_channel_data_type = (cl_channel_type)channelType;
4634         format.image_channel_order = (cl_channel_order)channelOrder;
4635         return format;
4636     }
4637
4638     static bool isFormatSupported(cl_image_format format)
4639     {
4640         if (!haveOpenCL())
4641             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
4642
4643         cl_context context = (cl_context)Context::getDefault().ptr();
4644         // Figure out how many formats are supported by this context.
4645         cl_uint numFormats = 0;
4646         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4647                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
4648                                                 NULL, &numFormats);
4649         AutoBuffer<cl_image_format> formats(numFormats);
4650         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4651                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
4652                                          formats, NULL);
4653         CV_OclDbgAssert(err == CL_SUCCESS);
4654         for (cl_uint i = 0; i < numFormats; ++i)
4655         {
4656             if (!memcmp(&formats[i], &format, sizeof(format)))
4657             {
4658                 return true;
4659             }
4660         }
4661         return false;
4662     }
4663
4664     void init(const UMat &src, bool norm, bool alias)
4665     {
4666         if (!haveOpenCL())
4667             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
4668
4669         CV_Assert(!src.empty());
4670         CV_Assert(ocl::Device::getDefault().imageSupport());
4671
4672         int err, depth = src.depth(), cn = src.channels();
4673         CV_Assert(cn <= 4);
4674         cl_image_format format = getImageFormat(depth, cn, norm);
4675
4676         if (!isFormatSupported(format))
4677             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
4678
4679         if (alias && !src.handle(ACCESS_RW))
4680             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
4681
4682         cl_context context = (cl_context)Context::getDefault().ptr();
4683         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
4684
4685 #ifdef CL_VERSION_1_2
4686         // this enables backwards portability to
4687         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
4688         const Device & d = ocl::Device::getDefault();
4689         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
4690         CV_Assert(!alias || canCreateAlias(src));
4691         if (1 < major || (1 == major && 2 <= minor))
4692         {
4693             cl_image_desc desc;
4694             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
4695             desc.image_width      = src.cols;
4696             desc.image_height     = src.rows;
4697             desc.image_depth      = 0;
4698             desc.image_array_size = 1;
4699             desc.image_row_pitch  = alias ? src.step[0] : 0;
4700             desc.image_slice_pitch = 0;
4701             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
4702             desc.num_mip_levels   = 0;
4703             desc.num_samples      = 0;
4704             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
4705         }
4706         else
4707 #endif
4708         {
4709             CV_SUPPRESS_DEPRECATED_START
4710             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
4711             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
4712             CV_SUPPRESS_DEPRECATED_END
4713         }
4714         CV_OclDbgAssert(err == CL_SUCCESS);
4715
4716         size_t origin[] = { 0, 0, 0 };
4717         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
4718
4719         cl_mem devData;
4720         if (!alias && !src.isContinuous())
4721         {
4722             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
4723             CV_OclDbgAssert(err == CL_SUCCESS);
4724
4725             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
4726             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
4727                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
4728             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4729         }
4730         else
4731         {
4732             devData = (cl_mem)src.handle(ACCESS_READ);
4733         }
4734         CV_Assert(devData != NULL);
4735
4736         if (!alias)
4737         {
4738             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
4739             if (!src.isContinuous())
4740             {
4741                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4742                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
4743             }
4744         }
4745     }
4746
4747     IMPLEMENT_REFCOUNTABLE();
4748
4749     cl_mem handle;
4750 };
4751
4752 Image2D::Image2D()
4753 {
4754     p = NULL;
4755 }
4756
4757 Image2D::Image2D(const UMat &src, bool norm, bool alias)
4758 {
4759     p = new Impl(src, norm, alias);
4760 }
4761
4762 bool Image2D::canCreateAlias(const UMat &m)
4763 {
4764     bool ret = false;
4765     const Device & d = ocl::Device::getDefault();
4766     if (d.imageFromBufferSupport() && !m.empty())
4767     {
4768         // This is the required pitch alignment in pixels
4769         uint pitchAlign = d.imagePitchAlignment();
4770         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
4771         {
4772             // We don't currently handle the case where the buffer was created
4773             // with CL_MEM_USE_HOST_PTR
4774             if (!m.u->tempUMat())
4775             {
4776                 ret = true;
4777             }
4778         }
4779     }
4780     return ret;
4781 }
4782
4783 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
4784 {
4785     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
4786
4787     return Impl::isFormatSupported(format);
4788 }
4789
4790 Image2D::Image2D(const Image2D & i)
4791 {
4792     p = i.p;
4793     if (p)
4794         p->addref();
4795 }
4796
4797 Image2D & Image2D::operator = (const Image2D & i)
4798 {
4799     if (i.p != p)
4800     {
4801         if (i.p)
4802             i.p->addref();
4803         if (p)
4804             p->release();
4805         p = i.p;
4806     }
4807     return *this;
4808 }
4809
4810 Image2D::~Image2D()
4811 {
4812     if (p)
4813         p->release();
4814 }
4815
4816 void* Image2D::ptr() const
4817 {
4818     return p ? p->handle : 0;
4819 }
4820
4821 bool isPerformanceCheckBypassed()
4822 {
4823     static bool initialized = false;
4824     static bool value = false;
4825     if (!initialized)
4826     {
4827         value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
4828         initialized = true;
4829     }
4830     return value;
4831 }
4832
4833 }}