Merge pull request #2781 from ilya-lavrenov:tapi_meanstddev
[profile/ivi/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50
51 #include "opencv2/core/bufferpool.hpp"
52 #ifndef LOG_BUFFER_POOL
53 # if 0
54 #   define LOG_BUFFER_POOL printf
55 # else
56 #   define LOG_BUFFER_POOL(...)
57 # endif
58 #endif
59
60 // TODO Move to some common place
61 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
62 {
63 #ifdef HAVE_WINRT
64     const char* envValue = NULL;
65 #else
66     const char* envValue = getenv(name);
67 #endif
68     if (envValue == NULL)
69     {
70         return defaultValue;
71     }
72     cv::String value = envValue;
73     size_t pos = 0;
74     for (; pos < value.size(); pos++)
75     {
76         if (!isdigit(value[pos]))
77             break;
78     }
79     cv::String valueStr = value.substr(0, pos);
80     cv::String suffixStr = value.substr(pos, value.length() - pos);
81     int v = atoi(valueStr.c_str());
82     if (suffixStr.length() == 0)
83         return v;
84     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
85         return v * 1024 * 1024;
86     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
87         return v * 1024;
88     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
89 }
90
91 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
92 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
93
94 #ifdef HAVE_OPENCL
95 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
96 #else
97 // TODO FIXIT: This file can't be build without OPENCL
98
99 /*
100   Part of the file is an extract from the standard OpenCL headers from Khronos site.
101   Below is the original copyright.
102 */
103
104 /*******************************************************************************
105  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
106  *
107  * Permission is hereby granted, free of charge, to any person obtaining a
108  * copy of this software and/or associated documentation files (the
109  * "Materials"), to deal in the Materials without restriction, including
110  * without limitation the rights to use, copy, modify, merge, publish,
111  * distribute, sublicense, and/or sell copies of the Materials, and to
112  * permit persons to whom the Materials are furnished to do so, subject to
113  * the following conditions:
114  *
115  * The above copyright notice and this permission notice shall be included
116  * in all copies or substantial portions of the Materials.
117  *
118  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
119  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
120  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
121  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
122  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
123  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
124  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
125  ******************************************************************************/
126
127 #if 0 //defined __APPLE__
128 #define HAVE_OPENCL 1
129 #else
130 #undef HAVE_OPENCL
131 #endif
132
133 #define OPENCV_CL_NOT_IMPLEMENTED -1000
134
135 #ifdef HAVE_OPENCL
136
137 #if defined __APPLE__
138 #include <OpenCL/opencl.h>
139 #else
140 #include <CL/opencl.h>
141 #endif
142
143 static const bool g_haveOpenCL = true;
144
145 #else
146
147 extern "C" {
148
149 struct _cl_platform_id { int dummy; };
150 struct _cl_device_id { int dummy; };
151 struct _cl_context { int dummy; };
152 struct _cl_command_queue { int dummy; };
153 struct _cl_mem { int dummy; };
154 struct _cl_program { int dummy; };
155 struct _cl_kernel { int dummy; };
156 struct _cl_event { int dummy; };
157 struct _cl_sampler { int dummy; };
158
159 typedef struct _cl_platform_id *    cl_platform_id;
160 typedef struct _cl_device_id *      cl_device_id;
161 typedef struct _cl_context *        cl_context;
162 typedef struct _cl_command_queue *  cl_command_queue;
163 typedef struct _cl_mem *            cl_mem;
164 typedef struct _cl_program *        cl_program;
165 typedef struct _cl_kernel *         cl_kernel;
166 typedef struct _cl_event *          cl_event;
167 typedef struct _cl_sampler *        cl_sampler;
168
169 typedef int cl_int;
170 typedef unsigned cl_uint;
171 #if defined (_WIN32) && defined(_MSC_VER)
172     typedef __int64 cl_long;
173     typedef unsigned __int64 cl_ulong;
174 #else
175     typedef long cl_long;
176     typedef unsigned long cl_ulong;
177 #endif
178
179 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
180 typedef cl_ulong            cl_bitfield;
181 typedef cl_bitfield         cl_device_type;
182 typedef cl_uint             cl_platform_info;
183 typedef cl_uint             cl_device_info;
184 typedef cl_bitfield         cl_device_fp_config;
185 typedef cl_uint             cl_device_mem_cache_type;
186 typedef cl_uint             cl_device_local_mem_type;
187 typedef cl_bitfield         cl_device_exec_capabilities;
188 typedef cl_bitfield         cl_command_queue_properties;
189 typedef intptr_t            cl_device_partition_property;
190 typedef cl_bitfield         cl_device_affinity_domain;
191
192 typedef intptr_t            cl_context_properties;
193 typedef cl_uint             cl_context_info;
194 typedef cl_uint             cl_command_queue_info;
195 typedef cl_uint             cl_channel_order;
196 typedef cl_uint             cl_channel_type;
197 typedef cl_bitfield         cl_mem_flags;
198 typedef cl_uint             cl_mem_object_type;
199 typedef cl_uint             cl_mem_info;
200 typedef cl_bitfield         cl_mem_migration_flags;
201 typedef cl_uint             cl_image_info;
202 typedef cl_uint             cl_buffer_create_type;
203 typedef cl_uint             cl_addressing_mode;
204 typedef cl_uint             cl_filter_mode;
205 typedef cl_uint             cl_sampler_info;
206 typedef cl_bitfield         cl_map_flags;
207 typedef cl_uint             cl_program_info;
208 typedef cl_uint             cl_program_build_info;
209 typedef cl_uint             cl_program_binary_type;
210 typedef cl_int              cl_build_status;
211 typedef cl_uint             cl_kernel_info;
212 typedef cl_uint             cl_kernel_arg_info;
213 typedef cl_uint             cl_kernel_arg_address_qualifier;
214 typedef cl_uint             cl_kernel_arg_access_qualifier;
215 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
216 typedef cl_uint             cl_kernel_work_group_info;
217 typedef cl_uint             cl_event_info;
218 typedef cl_uint             cl_command_type;
219 typedef cl_uint             cl_profiling_info;
220
221
222 typedef struct _cl_image_format {
223     cl_channel_order        image_channel_order;
224     cl_channel_type         image_channel_data_type;
225 } cl_image_format;
226
227 typedef struct _cl_image_desc {
228     cl_mem_object_type      image_type;
229     size_t                  image_width;
230     size_t                  image_height;
231     size_t                  image_depth;
232     size_t                  image_array_size;
233     size_t                  image_row_pitch;
234     size_t                  image_slice_pitch;
235     cl_uint                 num_mip_levels;
236     cl_uint                 num_samples;
237     cl_mem                  buffer;
238 } cl_image_desc;
239
240 typedef struct _cl_buffer_region {
241     size_t                  origin;
242     size_t                  size;
243 } cl_buffer_region;
244
245
246 //////////////////////////////////////////////////////////
247
248 #define CL_SUCCESS                                  0
249 #define CL_DEVICE_NOT_FOUND                         -1
250 #define CL_DEVICE_NOT_AVAILABLE                     -2
251 #define CL_COMPILER_NOT_AVAILABLE                   -3
252 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
253 #define CL_OUT_OF_RESOURCES                         -5
254 #define CL_OUT_OF_HOST_MEMORY                       -6
255 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
256 #define CL_MEM_COPY_OVERLAP                         -8
257 #define CL_IMAGE_FORMAT_MISMATCH                    -9
258 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
259 #define CL_BUILD_PROGRAM_FAILURE                    -11
260 #define CL_MAP_FAILURE                              -12
261 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
262 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
263 #define CL_COMPILE_PROGRAM_FAILURE                  -15
264 #define CL_LINKER_NOT_AVAILABLE                     -16
265 #define CL_LINK_PROGRAM_FAILURE                     -17
266 #define CL_DEVICE_PARTITION_FAILED                  -18
267 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
268
269 #define CL_INVALID_VALUE                            -30
270 #define CL_INVALID_DEVICE_TYPE                      -31
271 #define CL_INVALID_PLATFORM                         -32
272 #define CL_INVALID_DEVICE                           -33
273 #define CL_INVALID_CONTEXT                          -34
274 #define CL_INVALID_QUEUE_PROPERTIES                 -35
275 #define CL_INVALID_COMMAND_QUEUE                    -36
276 #define CL_INVALID_HOST_PTR                         -37
277 #define CL_INVALID_MEM_OBJECT                       -38
278 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
279 #define CL_INVALID_IMAGE_SIZE                       -40
280 #define CL_INVALID_SAMPLER                          -41
281 #define CL_INVALID_BINARY                           -42
282 #define CL_INVALID_BUILD_OPTIONS                    -43
283 #define CL_INVALID_PROGRAM                          -44
284 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
285 #define CL_INVALID_KERNEL_NAME                      -46
286 #define CL_INVALID_KERNEL_DEFINITION                -47
287 #define CL_INVALID_KERNEL                           -48
288 #define CL_INVALID_ARG_INDEX                        -49
289 #define CL_INVALID_ARG_VALUE                        -50
290 #define CL_INVALID_ARG_SIZE                         -51
291 #define CL_INVALID_KERNEL_ARGS                      -52
292 #define CL_INVALID_WORK_DIMENSION                   -53
293 #define CL_INVALID_WORK_GROUP_SIZE                  -54
294 #define CL_INVALID_WORK_ITEM_SIZE                   -55
295 #define CL_INVALID_GLOBAL_OFFSET                    -56
296 #define CL_INVALID_EVENT_WAIT_LIST                  -57
297 #define CL_INVALID_EVENT                            -58
298 #define CL_INVALID_OPERATION                        -59
299 #define CL_INVALID_GL_OBJECT                        -60
300 #define CL_INVALID_BUFFER_SIZE                      -61
301 #define CL_INVALID_MIP_LEVEL                        -62
302 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
303 #define CL_INVALID_PROPERTY                         -64
304 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
305 #define CL_INVALID_COMPILER_OPTIONS                 -66
306 #define CL_INVALID_LINKER_OPTIONS                   -67
307 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
308
309 /*#define CL_VERSION_1_0                              1
310 #define CL_VERSION_1_1                              1
311 #define CL_VERSION_1_2                              1*/
312
313 #define CL_FALSE                                    0
314 #define CL_TRUE                                     1
315 #define CL_BLOCKING                                 CL_TRUE
316 #define CL_NON_BLOCKING                             CL_FALSE
317
318 #define CL_PLATFORM_PROFILE                         0x0900
319 #define CL_PLATFORM_VERSION                         0x0901
320 #define CL_PLATFORM_NAME                            0x0902
321 #define CL_PLATFORM_VENDOR                          0x0903
322 #define CL_PLATFORM_EXTENSIONS                      0x0904
323
324 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
325 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
326 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
327 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
328 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
329 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
330 #define CL_DEVICE_TYPE                              0x1000
331 #define CL_DEVICE_VENDOR_ID                         0x1001
332 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
333 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
334 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
335 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
336 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
337 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
338 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
339 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
340 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
341 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
342 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
343 #define CL_DEVICE_ADDRESS_BITS                      0x100D
344 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
345 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
346 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
347 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
348 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
349 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
350 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
351 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
352 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
353 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
354 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
355 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
356 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
357 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
358 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
359 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
360 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
361 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
362 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
363 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
364 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
365 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
366 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
367 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
368 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
369 #define CL_DEVICE_AVAILABLE                         0x1027
370 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
371 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
372 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
373 #define CL_DEVICE_NAME                              0x102B
374 #define CL_DEVICE_VENDOR                            0x102C
375 #define CL_DRIVER_VERSION                           0x102D
376 #define CL_DEVICE_PROFILE                           0x102E
377 #define CL_DEVICE_VERSION                           0x102F
378 #define CL_DEVICE_EXTENSIONS                        0x1030
379 #define CL_DEVICE_PLATFORM                          0x1031
380 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
381 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
382 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
383 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
384 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
385 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
386 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
387 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
388 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
389 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
390 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
391 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
392 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
393 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
394 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
395 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
396 #define CL_DEVICE_PARENT_DEVICE                     0x1042
397 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
398 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
399 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
400 #define CL_DEVICE_PARTITION_TYPE                    0x1046
401 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
402 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
403 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
404 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
405 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
406
407 #define CL_FP_DENORM                                (1 << 0)
408 #define CL_FP_INF_NAN                               (1 << 1)
409 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
410 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
411 #define CL_FP_ROUND_TO_INF                          (1 << 4)
412 #define CL_FP_FMA                                   (1 << 5)
413 #define CL_FP_SOFT_FLOAT                            (1 << 6)
414 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
415
416 #define CL_NONE                                     0x0
417 #define CL_READ_ONLY_CACHE                          0x1
418 #define CL_READ_WRITE_CACHE                         0x2
419 #define CL_LOCAL                                    0x1
420 #define CL_GLOBAL                                   0x2
421 #define CL_EXEC_KERNEL                              (1 << 0)
422 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
423 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
424 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
425
426 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
427 #define CL_CONTEXT_DEVICES                          0x1081
428 #define CL_CONTEXT_PROPERTIES                       0x1082
429 #define CL_CONTEXT_NUM_DEVICES                      0x1083
430 #define CL_CONTEXT_PLATFORM                         0x1084
431 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
432
433 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
434 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
435 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
436 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
437 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
438 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
439 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
440 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
441 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
442 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
443 #define CL_QUEUE_CONTEXT                            0x1090
444 #define CL_QUEUE_DEVICE                             0x1091
445 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
446 #define CL_QUEUE_PROPERTIES                         0x1093
447 #define CL_MEM_READ_WRITE                           (1 << 0)
448 #define CL_MEM_WRITE_ONLY                           (1 << 1)
449 #define CL_MEM_READ_ONLY                            (1 << 2)
450 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
451 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
452 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
453 // reserved                                         (1 << 6)
454 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
455 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
456 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
457 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
458 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
459
460 #define CL_R                                        0x10B0
461 #define CL_A                                        0x10B1
462 #define CL_RG                                       0x10B2
463 #define CL_RA                                       0x10B3
464 #define CL_RGB                                      0x10B4
465 #define CL_RGBA                                     0x10B5
466 #define CL_BGRA                                     0x10B6
467 #define CL_ARGB                                     0x10B7
468 #define CL_INTENSITY                                0x10B8
469 #define CL_LUMINANCE                                0x10B9
470 #define CL_Rx                                       0x10BA
471 #define CL_RGx                                      0x10BB
472 #define CL_RGBx                                     0x10BC
473 #define CL_DEPTH                                    0x10BD
474 #define CL_DEPTH_STENCIL                            0x10BE
475
476 #define CL_SNORM_INT8                               0x10D0
477 #define CL_SNORM_INT16                              0x10D1
478 #define CL_UNORM_INT8                               0x10D2
479 #define CL_UNORM_INT16                              0x10D3
480 #define CL_UNORM_SHORT_565                          0x10D4
481 #define CL_UNORM_SHORT_555                          0x10D5
482 #define CL_UNORM_INT_101010                         0x10D6
483 #define CL_SIGNED_INT8                              0x10D7
484 #define CL_SIGNED_INT16                             0x10D8
485 #define CL_SIGNED_INT32                             0x10D9
486 #define CL_UNSIGNED_INT8                            0x10DA
487 #define CL_UNSIGNED_INT16                           0x10DB
488 #define CL_UNSIGNED_INT32                           0x10DC
489 #define CL_HALF_FLOAT                               0x10DD
490 #define CL_FLOAT                                    0x10DE
491 #define CL_UNORM_INT24                              0x10DF
492
493 #define CL_MEM_OBJECT_BUFFER                        0x10F0
494 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
495 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
496 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
497 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
498 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
499 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
500
501 #define CL_MEM_TYPE                                 0x1100
502 #define CL_MEM_FLAGS                                0x1101
503 #define CL_MEM_SIZE                                 0x1102
504 #define CL_MEM_HOST_PTR                             0x1103
505 #define CL_MEM_MAP_COUNT                            0x1104
506 #define CL_MEM_REFERENCE_COUNT                      0x1105
507 #define CL_MEM_CONTEXT                              0x1106
508 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
509 #define CL_MEM_OFFSET                               0x1108
510
511 #define CL_IMAGE_FORMAT                             0x1110
512 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
513 #define CL_IMAGE_ROW_PITCH                          0x1112
514 #define CL_IMAGE_SLICE_PITCH                        0x1113
515 #define CL_IMAGE_WIDTH                              0x1114
516 #define CL_IMAGE_HEIGHT                             0x1115
517 #define CL_IMAGE_DEPTH                              0x1116
518 #define CL_IMAGE_ARRAY_SIZE                         0x1117
519 #define CL_IMAGE_BUFFER                             0x1118
520 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
521 #define CL_IMAGE_NUM_SAMPLES                        0x111A
522
523 #define CL_ADDRESS_NONE                             0x1130
524 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
525 #define CL_ADDRESS_CLAMP                            0x1132
526 #define CL_ADDRESS_REPEAT                           0x1133
527 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
528
529 #define CL_FILTER_NEAREST                           0x1140
530 #define CL_FILTER_LINEAR                            0x1141
531
532 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
533 #define CL_SAMPLER_CONTEXT                          0x1151
534 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
535 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
536 #define CL_SAMPLER_FILTER_MODE                      0x1154
537
538 #define CL_MAP_READ                                 (1 << 0)
539 #define CL_MAP_WRITE                                (1 << 1)
540 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
541
542 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
543 #define CL_PROGRAM_CONTEXT                          0x1161
544 #define CL_PROGRAM_NUM_DEVICES                      0x1162
545 #define CL_PROGRAM_DEVICES                          0x1163
546 #define CL_PROGRAM_SOURCE                           0x1164
547 #define CL_PROGRAM_BINARY_SIZES                     0x1165
548 #define CL_PROGRAM_BINARIES                         0x1166
549 #define CL_PROGRAM_NUM_KERNELS                      0x1167
550 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
551 #define CL_PROGRAM_BUILD_STATUS                     0x1181
552 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
553 #define CL_PROGRAM_BUILD_LOG                        0x1183
554 #define CL_PROGRAM_BINARY_TYPE                      0x1184
555 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
556 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
557 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
558 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
559
560 #define CL_BUILD_SUCCESS                            0
561 #define CL_BUILD_NONE                               -1
562 #define CL_BUILD_ERROR                              -2
563 #define CL_BUILD_IN_PROGRESS                        -3
564
565 #define CL_KERNEL_FUNCTION_NAME                     0x1190
566 #define CL_KERNEL_NUM_ARGS                          0x1191
567 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
568 #define CL_KERNEL_CONTEXT                           0x1193
569 #define CL_KERNEL_PROGRAM                           0x1194
570 #define CL_KERNEL_ATTRIBUTES                        0x1195
571 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
572 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
573 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
574 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
575 #define CL_KERNEL_ARG_NAME                          0x119A
576 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
577 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
578 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
579 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
580 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
581 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
582 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
583 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
584 #define CL_KERNEL_ARG_TYPE_NONE                     0
585 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
586 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
587 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
588 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
589 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
590 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
591 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
592 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
593 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
594
595 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
596 #define CL_EVENT_COMMAND_TYPE                       0x11D1
597 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
598 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
599 #define CL_EVENT_CONTEXT                            0x11D4
600
601 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
602 #define CL_COMMAND_TASK                             0x11F1
603 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
604 #define CL_COMMAND_READ_BUFFER                      0x11F3
605 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
606 #define CL_COMMAND_COPY_BUFFER                      0x11F5
607 #define CL_COMMAND_READ_IMAGE                       0x11F6
608 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
609 #define CL_COMMAND_COPY_IMAGE                       0x11F8
610 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
611 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
612 #define CL_COMMAND_MAP_BUFFER                       0x11FB
613 #define CL_COMMAND_MAP_IMAGE                        0x11FC
614 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
615 #define CL_COMMAND_MARKER                           0x11FE
616 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
617 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
618 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
619 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
620 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
621 #define CL_COMMAND_USER                             0x1204
622 #define CL_COMMAND_BARRIER                          0x1205
623 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
624 #define CL_COMMAND_FILL_BUFFER                      0x1207
625 #define CL_COMMAND_FILL_IMAGE                       0x1208
626
627 #define CL_COMPLETE                                 0x0
628 #define CL_RUNNING                                  0x1
629 #define CL_SUBMITTED                                0x2
630 #define CL_QUEUED                                   0x3
631 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
632
633 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
634 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
635 #define CL_PROFILING_COMMAND_START                  0x1282
636 #define CL_PROFILING_COMMAND_END                    0x1283
637
638 #define CL_CALLBACK CV_STDCALL
639
640 static volatile bool g_haveOpenCL = false;
641 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
642
643 #if defined(__APPLE__)
644 #include <dlfcn.h>
645
646 static void* initOpenCLAndLoad(const char* funcname)
647 {
648     static bool initialized = false;
649     static void* handle = 0;
650     if (!handle)
651     {
652         if(!initialized)
653         {
654             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
655             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
656                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
657             handle = dlopen(oclpath, RTLD_LAZY);
658             initialized = true;
659             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
660             if( g_haveOpenCL )
661                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
662             else
663                 fprintf(stderr, "Failed to load OpenCL runtime\n");
664         }
665         if(!handle)
666             return 0;
667     }
668
669     return funcname && handle ? dlsym(handle, funcname) : 0;
670 }
671
672 #elif defined WIN32 || defined _WIN32
673
674 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
675   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
676 #endif
677 #include <windows.h>
678 #if (_WIN32_WINNT >= 0x0602)
679   #include <synchapi.h>
680 #endif
681 #undef small
682 #undef min
683 #undef max
684 #undef abs
685
686 static void* initOpenCLAndLoad(const char* funcname)
687 {
688     static bool initialized = false;
689     static HMODULE handle = 0;
690     if (!handle)
691     {
692 #ifndef HAVE_WINRT
693         if(!initialized)
694         {
695             handle = LoadLibraryA("OpenCL.dll");
696             initialized = true;
697             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
698         }
699 #endif
700         if(!handle)
701             return 0;
702     }
703
704     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
705 }
706
707 #elif defined(__linux)
708
709 #include <dlfcn.h>
710 #include <stdio.h>
711
712 static void* initOpenCLAndLoad(const char* funcname)
713 {
714     static bool initialized = false;
715     static void* handle = 0;
716     if (!handle)
717     {
718         if(!initialized)
719         {
720             handle = dlopen("libOpenCL.so", RTLD_LAZY);
721             if(!handle)
722                 handle = dlopen("libCL.so", RTLD_LAZY);
723             initialized = true;
724             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
725         }
726         if(!handle)
727             return 0;
728     }
729
730     return funcname ? (void*)dlsym(handle, funcname) : 0;
731 }
732
733 #else
734
735 static void* initOpenCLAndLoad(const char*)
736 {
737     return 0;
738 }
739
740 #endif
741
742
743 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
744     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
745     static rettype funcname argsdecl \
746     { \
747         static funcname##_t funcname##_p = 0; \
748         if( !funcname##_p ) \
749         { \
750             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
751             if( !funcname##_p ) \
752                 return OPENCV_CL_NOT_IMPLEMENTED; \
753         } \
754         return funcname##_p args; \
755     }
756
757
758 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
759     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
760     static rettype funcname argsdecl \
761     { \
762         static funcname##_t funcname##_p = 0; \
763         if( !funcname##_p ) \
764         { \
765             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
766             if( !funcname##_p ) \
767             { \
768                 if( errcode_ret ) \
769                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
770                 return 0; \
771             } \
772         } \
773         return funcname##_p args; \
774     }
775
776 OCL_FUNC(cl_int, clGetPlatformIDs,
777     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
778     (num_entries, platforms, num_platforms))
779
780 OCL_FUNC(cl_int, clGetPlatformInfo,
781     (cl_platform_id platform, cl_platform_info param_name,
782     size_t param_value_size, void * param_value,
783     size_t * param_value_size_ret),
784     (platform, param_name, param_value_size, param_value, param_value_size_ret))
785
786 OCL_FUNC(cl_int, clGetDeviceInfo,
787          (cl_device_id device,
788           cl_device_info param_name,
789           size_t param_value_size,
790           void * param_value,
791           size_t * param_value_size_ret),
792          (device, param_name, param_value_size, param_value, param_value_size_ret))
793
794
795 OCL_FUNC(cl_int, clGetDeviceIDs,
796     (cl_platform_id platform,
797     cl_device_type device_type,
798     cl_uint num_entries,
799     cl_device_id * devices,
800     cl_uint * num_devices),
801     (platform, device_type, num_entries, devices, num_devices))
802
803 OCL_FUNC_P(cl_context, clCreateContext,
804     (const cl_context_properties * properties,
805     cl_uint num_devices,
806     const cl_device_id * devices,
807     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
808     void * user_data,
809     cl_int * errcode_ret),
810     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
811
812 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
813
814 /*
815 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
816
817 OCL_FUNC_P(cl_context, clCreateContextFromType,
818     (const cl_context_properties * properties,
819     cl_device_type device_type,
820     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
821     void * user_data,
822     cl_int * errcode_ret),
823     (properties, device_type, pfn_notify, user_data, errcode_ret))
824
825 OCL_FUNC(cl_int, clGetContextInfo,
826     (cl_context context,
827     cl_context_info param_name,
828     size_t param_value_size,
829     void * param_value,
830     size_t * param_value_size_ret),
831     (context, param_name, param_value_size,
832     param_value, param_value_size_ret))
833 */
834 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
835     (cl_context context,
836     cl_device_id device,
837     cl_command_queue_properties properties,
838     cl_int * errcode_ret),
839     (context, device, properties, errcode_ret))
840
841 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
842
843 OCL_FUNC_P(cl_mem, clCreateBuffer,
844     (cl_context context,
845     cl_mem_flags flags,
846     size_t size,
847     void * host_ptr,
848     cl_int * errcode_ret),
849     (context, flags, size, host_ptr, errcode_ret))
850
851 /*
852 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
853
854 OCL_FUNC(cl_int, clGetCommandQueueInfo,
855  (cl_command_queue command_queue,
856  cl_command_queue_info param_name,
857  size_t param_value_size,
858  void * param_value,
859  size_t * param_value_size_ret),
860  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
861
862 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
863     (cl_mem buffer,
864     cl_mem_flags flags,
865     cl_buffer_create_type buffer_create_type,
866     const void * buffer_create_info,
867     cl_int * errcode_ret),
868     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
869 */
870
871 OCL_FUNC_P(cl_mem, clCreateImage,
872     (cl_context context,
873     cl_mem_flags flags,
874     const cl_image_format * image_format,
875     const cl_image_desc * image_desc,
876     void * host_ptr,
877     cl_int * errcode_ret),
878     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
879
880 OCL_FUNC_P(cl_mem, clCreateImage2D,
881     (cl_context context,
882     cl_mem_flags flags,
883     const cl_image_format * image_format,
884     size_t image_width,
885     size_t image_height,
886     size_t image_row_pitch,
887     void * host_ptr,
888     cl_int *errcode_ret),
889     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
890
891 OCL_FUNC(cl_int, clGetSupportedImageFormats,
892  (cl_context context,
893  cl_mem_flags flags,
894  cl_mem_object_type image_type,
895  cl_uint num_entries,
896  cl_image_format * image_formats,
897  cl_uint * num_image_formats),
898  (context, flags, image_type, num_entries, image_formats, num_image_formats))
899
900 /*
901 OCL_FUNC(cl_int, clGetMemObjectInfo,
902  (cl_mem memobj,
903  cl_mem_info param_name,
904  size_t param_value_size,
905  void * param_value,
906  size_t * param_value_size_ret),
907  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
908
909 OCL_FUNC(cl_int, clGetImageInfo,
910  (cl_mem image,
911  cl_image_info param_name,
912  size_t param_value_size,
913  void * param_value,
914  size_t * param_value_size_ret),
915  (image, param_name, param_value_size, param_value, param_value_size_ret))
916
917 OCL_FUNC(cl_int, clCreateKernelsInProgram,
918  (cl_program program,
919  cl_uint num_kernels,
920  cl_kernel * kernels,
921  cl_uint * num_kernels_ret),
922  (program, num_kernels, kernels, num_kernels_ret))
923
924 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
925
926 OCL_FUNC(cl_int, clGetKernelArgInfo,
927  (cl_kernel kernel,
928  cl_uint arg_indx,
929  cl_kernel_arg_info param_name,
930  size_t param_value_size,
931  void * param_value,
932  size_t * param_value_size_ret),
933  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
934
935 OCL_FUNC(cl_int, clEnqueueReadImage,
936  (cl_command_queue command_queue,
937  cl_mem image,
938  cl_bool blocking_read,
939  const size_t * origin[3],
940  const size_t * region[3],
941  size_t row_pitch,
942  size_t slice_pitch,
943  void * ptr,
944  cl_uint num_events_in_wait_list,
945  const cl_event * event_wait_list,
946  cl_event * event),
947  (command_queue, image, blocking_read, origin, region,
948  row_pitch, slice_pitch,
949  ptr,
950  num_events_in_wait_list,
951  event_wait_list,
952  event))
953
954 OCL_FUNC(cl_int, clEnqueueWriteImage,
955  (cl_command_queue command_queue,
956  cl_mem image,
957  cl_bool blocking_write,
958  const size_t * origin[3],
959  const size_t * region[3],
960  size_t input_row_pitch,
961  size_t input_slice_pitch,
962  const void * ptr,
963  cl_uint num_events_in_wait_list,
964  const cl_event * event_wait_list,
965  cl_event * event),
966  (command_queue, image, blocking_write, origin, region, input_row_pitch,
967  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
968
969 OCL_FUNC(cl_int, clEnqueueFillImage,
970  (cl_command_queue command_queue,
971  cl_mem image,
972  const void * fill_color,
973  const size_t * origin[3],
974  const size_t * region[3],
975  cl_uint num_events_in_wait_list,
976  const cl_event * event_wait_list,
977  cl_event * event),
978  (command_queue, image, fill_color, origin, region,
979  num_events_in_wait_list, event_wait_list, event))
980
981 OCL_FUNC(cl_int, clEnqueueCopyImage,
982  (cl_command_queue command_queue,
983  cl_mem src_image,
984  cl_mem dst_image,
985  const size_t * src_origin[3],
986  const size_t * dst_origin[3],
987  const size_t * region[3],
988  cl_uint num_events_in_wait_list,
989  const cl_event * event_wait_list,
990  cl_event * event),
991  (command_queue, src_image, dst_image, src_origin, dst_origin,
992  region, num_events_in_wait_list, event_wait_list, event))
993
994 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
995  (cl_command_queue command_queue,
996  cl_mem src_image,
997  cl_mem dst_buffer,
998  const size_t * src_origin[3],
999  const size_t * region[3],
1000  size_t dst_offset,
1001  cl_uint num_events_in_wait_list,
1002  const cl_event * event_wait_list,
1003  cl_event * event),
1004  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1005  num_events_in_wait_list, event_wait_list, event))
1006 */
1007
1008 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1009  (cl_command_queue command_queue,
1010  cl_mem src_buffer,
1011  cl_mem dst_image,
1012  size_t src_offset,
1013  const size_t dst_origin[3],
1014  const size_t region[3],
1015  cl_uint num_events_in_wait_list,
1016  const cl_event * event_wait_list,
1017  cl_event * event),
1018  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1019  region, num_events_in_wait_list, event_wait_list, event))
1020
1021  OCL_FUNC(cl_int, clFlush,
1022  (cl_command_queue command_queue),
1023  (command_queue))
1024
1025 /*
1026 OCL_FUNC_P(void*, clEnqueueMapImage,
1027  (cl_command_queue command_queue,
1028  cl_mem image,
1029  cl_bool blocking_map,
1030  cl_map_flags map_flags,
1031  const size_t * origin[3],
1032  const size_t * region[3],
1033  size_t * image_row_pitch,
1034  size_t * image_slice_pitch,
1035  cl_uint num_events_in_wait_list,
1036  const cl_event * event_wait_list,
1037  cl_event * event,
1038  cl_int * errcode_ret),
1039  (command_queue, image, blocking_map, map_flags, origin, region,
1040  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1041  event_wait_list, event, errcode_ret))
1042 */
1043
1044 /*
1045 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1046
1047 OCL_FUNC(cl_int, clGetKernelInfo,
1048  (cl_kernel kernel,
1049  cl_kernel_info param_name,
1050  size_t param_value_size,
1051  void * param_value,
1052  size_t * param_value_size_ret),
1053  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1054
1055 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1056
1057 */
1058
1059 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1060
1061
1062 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1063     (cl_context context,
1064     cl_uint count,
1065     const char ** strings,
1066     const size_t * lengths,
1067     cl_int * errcode_ret),
1068     (context, count, strings, lengths, errcode_ret))
1069
1070 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1071     (cl_context context,
1072     cl_uint num_devices,
1073     const cl_device_id * device_list,
1074     const size_t * lengths,
1075     const unsigned char ** binaries,
1076     cl_int * binary_status,
1077     cl_int * errcode_ret),
1078     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1079
1080 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1081
1082 OCL_FUNC(cl_int, clBuildProgram,
1083     (cl_program program,
1084     cl_uint num_devices,
1085     const cl_device_id * device_list,
1086     const char * options,
1087     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1088     void * user_data),
1089     (program, num_devices, device_list, options, pfn_notify, user_data))
1090
1091 OCL_FUNC(cl_int, clGetProgramInfo,
1092     (cl_program program,
1093     cl_program_info param_name,
1094     size_t param_value_size,
1095     void * param_value,
1096     size_t * param_value_size_ret),
1097     (program, param_name, param_value_size, param_value, param_value_size_ret))
1098
1099 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1100     (cl_program program,
1101     cl_device_id device,
1102     cl_program_build_info param_name,
1103     size_t param_value_size,
1104     void * param_value,
1105     size_t * param_value_size_ret),
1106     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1107
1108 OCL_FUNC_P(cl_kernel, clCreateKernel,
1109     (cl_program program,
1110     const char * kernel_name,
1111     cl_int * errcode_ret),
1112     (program, kernel_name, errcode_ret))
1113
1114 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1115
1116 OCL_FUNC(cl_int, clSetKernelArg,
1117     (cl_kernel kernel,
1118     cl_uint arg_index,
1119     size_t arg_size,
1120     const void * arg_value),
1121     (kernel, arg_index, arg_size, arg_value))
1122
1123 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1124     (cl_kernel kernel,
1125     cl_device_id device,
1126     cl_kernel_work_group_info param_name,
1127     size_t param_value_size,
1128     void * param_value,
1129     size_t * param_value_size_ret),
1130     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1131
1132 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1133
1134 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1135     (cl_command_queue command_queue,
1136     cl_mem buffer,
1137     cl_bool blocking_read,
1138     size_t offset,
1139     size_t size,
1140     void * ptr,
1141     cl_uint num_events_in_wait_list,
1142     const cl_event * event_wait_list,
1143     cl_event * event),
1144     (command_queue, buffer, blocking_read, offset, size, ptr,
1145     num_events_in_wait_list, event_wait_list, event))
1146
1147 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1148     (cl_command_queue command_queue,
1149     cl_mem buffer,
1150     cl_bool blocking_read,
1151     const size_t * buffer_offset,
1152     const size_t * host_offset,
1153     const size_t * region,
1154     size_t buffer_row_pitch,
1155     size_t buffer_slice_pitch,
1156     size_t host_row_pitch,
1157     size_t host_slice_pitch,
1158     void * ptr,
1159     cl_uint num_events_in_wait_list,
1160     const cl_event * event_wait_list,
1161     cl_event * event),
1162     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1163     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1164     event_wait_list, event))
1165
1166 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1167     (cl_command_queue command_queue,
1168     cl_mem buffer,
1169     cl_bool blocking_write,
1170     size_t offset,
1171     size_t size,
1172     const void * ptr,
1173     cl_uint num_events_in_wait_list,
1174     const cl_event * event_wait_list,
1175     cl_event * event),
1176     (command_queue, buffer, blocking_write, offset, size, ptr,
1177     num_events_in_wait_list, event_wait_list, event))
1178
1179 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1180     (cl_command_queue command_queue,
1181     cl_mem buffer,
1182     cl_bool blocking_write,
1183     const size_t * buffer_offset,
1184     const size_t * host_offset,
1185     const size_t * region,
1186     size_t buffer_row_pitch,
1187     size_t buffer_slice_pitch,
1188     size_t host_row_pitch,
1189     size_t host_slice_pitch,
1190     const void * ptr,
1191     cl_uint num_events_in_wait_list,
1192     const cl_event * event_wait_list,
1193     cl_event * event),
1194     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1195     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1196     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1197
1198 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1199     (cl_command_queue command_queue,
1200     cl_mem buffer,
1201     const void * pattern,
1202     size_t pattern_size,
1203     size_t offset,
1204     size_t size,
1205     cl_uint num_events_in_wait_list,
1206     const cl_event * event_wait_list,
1207     cl_event * event),
1208     (command_queue, buffer, pattern, pattern_size, offset, size,
1209     num_events_in_wait_list, event_wait_list, event))*/
1210
1211 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1212     (cl_command_queue command_queue,
1213     cl_mem src_buffer,
1214     cl_mem dst_buffer,
1215     size_t src_offset,
1216     size_t dst_offset,
1217     size_t size,
1218     cl_uint num_events_in_wait_list,
1219     const cl_event * event_wait_list,
1220     cl_event * event),
1221     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1222     size, num_events_in_wait_list, event_wait_list, event))
1223
1224 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1225     (cl_command_queue command_queue,
1226     cl_mem src_buffer,
1227     cl_mem dst_buffer,
1228     const size_t * src_origin,
1229     const size_t * dst_origin,
1230     const size_t * region,
1231     size_t src_row_pitch,
1232     size_t src_slice_pitch,
1233     size_t dst_row_pitch,
1234     size_t dst_slice_pitch,
1235     cl_uint num_events_in_wait_list,
1236     const cl_event * event_wait_list,
1237     cl_event * event),
1238     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1239     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1240     num_events_in_wait_list, event_wait_list, event))
1241
1242 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1243     (cl_command_queue command_queue,
1244     cl_mem buffer,
1245     cl_bool blocking_map,
1246     cl_map_flags map_flags,
1247     size_t offset,
1248     size_t size,
1249     cl_uint num_events_in_wait_list,
1250     const cl_event * event_wait_list,
1251     cl_event * event,
1252     cl_int * errcode_ret),
1253     (command_queue, buffer, blocking_map, map_flags, offset, size,
1254     num_events_in_wait_list, event_wait_list, event, errcode_ret))
1255
1256 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1257     (cl_command_queue command_queue,
1258     cl_mem memobj,
1259     void * mapped_ptr,
1260     cl_uint num_events_in_wait_list,
1261     const cl_event * event_wait_list,
1262     cl_event * event),
1263     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1264
1265 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1266     (cl_command_queue command_queue,
1267     cl_kernel kernel,
1268     cl_uint work_dim,
1269     const size_t * global_work_offset,
1270     const size_t * global_work_size,
1271     const size_t * local_work_size,
1272     cl_uint num_events_in_wait_list,
1273     const cl_event * event_wait_list,
1274     cl_event * event),
1275     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1276     local_work_size, num_events_in_wait_list, event_wait_list, event))
1277
1278 OCL_FUNC(cl_int, clEnqueueTask,
1279     (cl_command_queue command_queue,
1280     cl_kernel kernel,
1281     cl_uint num_events_in_wait_list,
1282     const cl_event * event_wait_list,
1283     cl_event * event),
1284     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1285
1286 OCL_FUNC(cl_int, clSetEventCallback,
1287     (cl_event event,
1288     cl_int command_exec_callback_type ,
1289     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1290     void *user_data),
1291     (event, command_exec_callback_type, pfn_event_notify, user_data))
1292
1293 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1294
1295 }
1296
1297 #endif
1298
1299 #ifndef CL_VERSION_1_2
1300 #define CL_VERSION_1_2
1301 #endif
1302
1303 #endif
1304
1305 #ifdef _DEBUG
1306 #define CV_OclDbgAssert CV_DbgAssert
1307 #else
1308 #define CV_OclDbgAssert(expr) (void)(expr)
1309 #endif
1310
1311 namespace cv { namespace ocl {
1312
1313 struct UMat2D
1314 {
1315     UMat2D(const UMat& m)
1316     {
1317         offset = (int)m.offset;
1318         step = (int)m.step;
1319         rows = m.rows;
1320         cols = m.cols;
1321     }
1322     int offset;
1323     int step;
1324     int rows;
1325     int cols;
1326 };
1327
1328 struct UMat3D
1329 {
1330     UMat3D(const UMat& m)
1331     {
1332         offset = (int)m.offset;
1333         step = (int)m.step.p[1];
1334         slicestep = (int)m.step.p[0];
1335         slices = (int)m.size.p[0];
1336         rows = m.size.p[1];
1337         cols = m.size.p[2];
1338     }
1339     int offset;
1340     int slicestep;
1341     int step;
1342     int slices;
1343     int rows;
1344     int cols;
1345 };
1346
1347 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1348 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1349 {
1350     static uint64 table[256];
1351     static bool initialized = false;
1352
1353     if( !initialized )
1354     {
1355         for( int i = 0; i < 256; i++ )
1356         {
1357             uint64 c = i;
1358             for( int j = 0; j < 8; j++ )
1359                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1360             table[i] = c;
1361         }
1362         initialized = true;
1363     }
1364
1365     uint64 crc = ~crc0;
1366     for( size_t idx = 0; idx < size; idx++ )
1367         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1368
1369     return ~crc;
1370 }
1371
1372 struct HashKey
1373 {
1374     typedef uint64 part;
1375     HashKey(part _a, part _b) : a(_a), b(_b) {}
1376     part a, b;
1377 };
1378
1379 inline bool operator == (const HashKey& h1, const HashKey& h2)
1380 {
1381     return h1.a == h2.a && h1.b == h2.b;
1382 }
1383
1384 inline bool operator < (const HashKey& h1, const HashKey& h2)
1385 {
1386     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1387 }
1388
1389
1390 bool haveOpenCL()
1391 {
1392 #ifdef HAVE_OPENCL
1393     static bool g_isOpenCLInitialized = false;
1394     static bool g_isOpenCLAvailable = false;
1395
1396     if (!g_isOpenCLInitialized)
1397     {
1398         try
1399         {
1400             cl_uint n = 0;
1401             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1402         }
1403         catch (...)
1404         {
1405             g_isOpenCLAvailable = false;
1406         }
1407         g_isOpenCLInitialized = true;
1408     }
1409     return g_isOpenCLAvailable;
1410 #else
1411     return false;
1412 #endif
1413 }
1414
1415 bool useOpenCL()
1416 {
1417     CoreTLSData* data = coreTlsData.get();
1418     if( data->useOpenCL < 0 )
1419         data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
1420     return data->useOpenCL > 0;
1421 }
1422
1423 void setUseOpenCL(bool flag)
1424 {
1425     if( haveOpenCL() )
1426     {
1427         CoreTLSData* data = coreTlsData.get();
1428         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1429     }
1430 }
1431
1432 #ifdef HAVE_CLAMDBLAS
1433
1434 class AmdBlasHelper
1435 {
1436 public:
1437     static AmdBlasHelper & getInstance()
1438     {
1439         static AmdBlasHelper amdBlas;
1440         return amdBlas;
1441     }
1442
1443     bool isAvailable() const
1444     {
1445         return g_isAmdBlasAvailable;
1446     }
1447
1448     ~AmdBlasHelper()
1449     {
1450         try
1451         {
1452             clAmdBlasTeardown();
1453         }
1454         catch (...) { }
1455     }
1456
1457 protected:
1458     AmdBlasHelper()
1459     {
1460         if (!g_isAmdBlasInitialized)
1461         {
1462             AutoLock lock(m);
1463
1464             if (!g_isAmdBlasInitialized && haveOpenCL())
1465             {
1466                 try
1467                 {
1468                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1469                 }
1470                 catch (...)
1471                 {
1472                     g_isAmdBlasAvailable = false;
1473                 }
1474             }
1475             else
1476                 g_isAmdBlasAvailable = false;
1477
1478             g_isAmdBlasInitialized = true;
1479         }
1480     }
1481
1482 private:
1483     static Mutex m;
1484     static bool g_isAmdBlasInitialized;
1485     static bool g_isAmdBlasAvailable;
1486 };
1487
1488 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1489 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1490 Mutex AmdBlasHelper::m;
1491
1492 bool haveAmdBlas()
1493 {
1494     return AmdBlasHelper::getInstance().isAvailable();
1495 }
1496
1497 #else
1498
1499 bool haveAmdBlas()
1500 {
1501     return false;
1502 }
1503
1504 #endif
1505
1506 #ifdef HAVE_CLAMDFFT
1507
1508 class AmdFftHelper
1509 {
1510 public:
1511     static AmdFftHelper & getInstance()
1512     {
1513         static AmdFftHelper amdFft;
1514         return amdFft;
1515     }
1516
1517     bool isAvailable() const
1518     {
1519         return g_isAmdFftAvailable;
1520     }
1521
1522     ~AmdFftHelper()
1523     {
1524         try
1525         {
1526 //            clAmdFftTeardown();
1527         }
1528         catch (...) { }
1529     }
1530
1531 protected:
1532     AmdFftHelper()
1533     {
1534         if (!g_isAmdFftInitialized)
1535         {
1536             AutoLock lock(m);
1537
1538             if (!g_isAmdFftInitialized && haveOpenCL())
1539             {
1540                 try
1541                 {
1542                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1543                     g_isAmdFftAvailable = true;
1544                 }
1545                 catch (const Exception &)
1546                 {
1547                     g_isAmdFftAvailable = false;
1548                 }
1549             }
1550             else
1551                 g_isAmdFftAvailable = false;
1552
1553             g_isAmdFftInitialized = true;
1554         }
1555     }
1556
1557 private:
1558     static clAmdFftSetupData setupData;
1559     static Mutex m;
1560     static bool g_isAmdFftInitialized;
1561     static bool g_isAmdFftAvailable;
1562 };
1563
1564 clAmdFftSetupData AmdFftHelper::setupData;
1565 bool AmdFftHelper::g_isAmdFftAvailable = false;
1566 bool AmdFftHelper::g_isAmdFftInitialized = false;
1567 Mutex AmdFftHelper::m;
1568
1569 bool haveAmdFft()
1570 {
1571     return AmdFftHelper::getInstance().isAvailable();
1572 }
1573
1574 #else
1575
1576 bool haveAmdFft()
1577 {
1578     return false;
1579 }
1580
1581 #endif
1582
1583 void finish()
1584 {
1585     Queue::getDefault().finish();
1586 }
1587
1588 #define IMPLEMENT_REFCOUNTABLE() \
1589     void addref() { CV_XADD(&refcount, 1); } \
1590     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1591     int refcount
1592
1593 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1594
1595 struct Platform::Impl
1596 {
1597     Impl()
1598     {
1599         refcount = 1;
1600         handle = 0;
1601         initialized = false;
1602     }
1603
1604     ~Impl() {}
1605
1606     void init()
1607     {
1608         if( !initialized )
1609         {
1610             //cl_uint num_entries
1611             cl_uint n = 0;
1612             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1613                 handle = 0;
1614             if( handle != 0 )
1615             {
1616                 char buf[1000];
1617                 size_t len = 0;
1618                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1619                 buf[len] = '\0';
1620                 vendor = String(buf);
1621             }
1622
1623             initialized = true;
1624         }
1625     }
1626
1627     IMPLEMENT_REFCOUNTABLE();
1628
1629     cl_platform_id handle;
1630     String vendor;
1631     bool initialized;
1632 };
1633
1634 Platform::Platform()
1635 {
1636     p = 0;
1637 }
1638
1639 Platform::~Platform()
1640 {
1641     if(p)
1642         p->release();
1643 }
1644
1645 Platform::Platform(const Platform& pl)
1646 {
1647     p = (Impl*)pl.p;
1648     if(p)
1649         p->addref();
1650 }
1651
1652 Platform& Platform::operator = (const Platform& pl)
1653 {
1654     Impl* newp = (Impl*)pl.p;
1655     if(newp)
1656         newp->addref();
1657     if(p)
1658         p->release();
1659     p = newp;
1660     return *this;
1661 }
1662
1663 void* Platform::ptr() const
1664 {
1665     return p ? p->handle : 0;
1666 }
1667
1668 Platform& Platform::getDefault()
1669 {
1670     static Platform p;
1671     if( !p.p )
1672     {
1673         p.p = new Impl;
1674         p.p->init();
1675     }
1676     return p;
1677 }
1678
1679 /////////////////////////////////////// Device ////////////////////////////////////////////
1680
1681 // deviceVersion has format
1682 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1683 // by specification
1684 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1685 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1686 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1687 {
1688     major = minor = 0;
1689     if (10 >= deviceVersion.length())
1690         return;
1691     const char *pstr = deviceVersion.c_str();
1692     if (0 != strncmp(pstr, "OpenCL ", 7))
1693         return;
1694     size_t ppos = deviceVersion.find('.', 7);
1695     if (String::npos == ppos)
1696         return;
1697     String temp = deviceVersion.substr(7, ppos - 7);
1698     major = atoi(temp.c_str());
1699     temp = deviceVersion.substr(ppos + 1);
1700     minor = atoi(temp.c_str());
1701 }
1702
1703 struct Device::Impl
1704 {
1705     Impl(void* d)
1706     {
1707         handle = (cl_device_id)d;
1708         refcount = 1;
1709
1710         name_ = getStrProp(CL_DEVICE_NAME);
1711         version_ = getStrProp(CL_DEVICE_VERSION);
1712         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1713         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1714         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1715         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1716         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1717         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1718
1719         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1720         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1721
1722         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1723         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1724             vendorName_ == "AMD")
1725             vendorID_ = VENDOR_AMD;
1726         else if (vendorName_ == "Intel(R) Corporation")
1727             vendorID_ = VENDOR_INTEL;
1728         else if (vendorName_ == "NVIDIA Corporation")
1729             vendorID_ = VENDOR_NVIDIA;
1730         else
1731             vendorID_ = UNKNOWN_VENDOR;
1732     }
1733
1734     template<typename _TpCL, typename _TpOut>
1735     _TpOut getProp(cl_device_info prop) const
1736     {
1737         _TpCL temp=_TpCL();
1738         size_t sz = 0;
1739
1740         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1741             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1742     }
1743
1744     bool getBoolProp(cl_device_info prop) const
1745     {
1746         cl_bool temp = CL_FALSE;
1747         size_t sz = 0;
1748
1749         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1750             sz == sizeof(temp) ? temp != 0 : false;
1751     }
1752
1753     String getStrProp(cl_device_info prop) const
1754     {
1755         char buf[1024];
1756         size_t sz=0;
1757         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1758             sz < sizeof(buf) ? String(buf) : String();
1759     }
1760
1761     IMPLEMENT_REFCOUNTABLE();
1762     cl_device_id handle;
1763
1764     String name_;
1765     String version_;
1766     int doubleFPConfig_;
1767     bool hostUnifiedMemory_;
1768     int maxComputeUnits_;
1769     size_t maxWorkGroupSize_;
1770     int type_;
1771     int deviceVersionMajor_;
1772     int deviceVersionMinor_;
1773     String driverVersion_;
1774     String vendorName_;
1775     int vendorID_;
1776 };
1777
1778
1779 Device::Device()
1780 {
1781     p = 0;
1782 }
1783
1784 Device::Device(void* d)
1785 {
1786     p = 0;
1787     set(d);
1788 }
1789
1790 Device::Device(const Device& d)
1791 {
1792     p = d.p;
1793     if(p)
1794         p->addref();
1795 }
1796
1797 Device& Device::operator = (const Device& d)
1798 {
1799     Impl* newp = (Impl*)d.p;
1800     if(newp)
1801         newp->addref();
1802     if(p)
1803         p->release();
1804     p = newp;
1805     return *this;
1806 }
1807
1808 Device::~Device()
1809 {
1810     if(p)
1811         p->release();
1812 }
1813
1814 void Device::set(void* d)
1815 {
1816     if(p)
1817         p->release();
1818     p = new Impl(d);
1819 }
1820
1821 void* Device::ptr() const
1822 {
1823     return p ? p->handle : 0;
1824 }
1825
1826 String Device::name() const
1827 { return p ? p->name_ : String(); }
1828
1829 String Device::extensions() const
1830 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1831
1832 String Device::version() const
1833 { return p ? p->version_ : String(); }
1834
1835 String Device::vendorName() const
1836 { return p ? p->vendorName_ : String(); }
1837
1838 int Device::vendorID() const
1839 { return p ? p->vendorID_ : 0; }
1840
1841 String Device::OpenCL_C_Version() const
1842 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1843
1844 String Device::OpenCLVersion() const
1845 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1846
1847 int Device::deviceVersionMajor() const
1848 { return p ? p->deviceVersionMajor_ : 0; }
1849
1850 int Device::deviceVersionMinor() const
1851 { return p ? p->deviceVersionMinor_ : 0; }
1852
1853 String Device::driverVersion() const
1854 { return p ? p->driverVersion_ : String(); }
1855
1856 int Device::type() const
1857 { return p ? p->type_ : 0; }
1858
1859 int Device::addressBits() const
1860 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1861
1862 bool Device::available() const
1863 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1864
1865 bool Device::compilerAvailable() const
1866 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1867
1868 bool Device::linkerAvailable() const
1869 #ifdef CL_VERSION_1_2
1870 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1871 #else
1872 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1873 #endif
1874
1875 int Device::doubleFPConfig() const
1876 { return p ? p->doubleFPConfig_ : 0; }
1877
1878 int Device::singleFPConfig() const
1879 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1880
1881 int Device::halfFPConfig() const
1882 #ifdef CL_VERSION_1_2
1883 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1884 #else
1885 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1886 #endif
1887
1888 bool Device::endianLittle() const
1889 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1890
1891 bool Device::errorCorrectionSupport() const
1892 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1893
1894 int Device::executionCapabilities() const
1895 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1896
1897 size_t Device::globalMemCacheSize() const
1898 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1899
1900 int Device::globalMemCacheType() const
1901 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1902
1903 int Device::globalMemCacheLineSize() const
1904 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1905
1906 size_t Device::globalMemSize() const
1907 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1908
1909 size_t Device::localMemSize() const
1910 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1911
1912 int Device::localMemType() const
1913 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1914
1915 bool Device::hostUnifiedMemory() const
1916 { return p ? p->hostUnifiedMemory_ : false; }
1917
1918 bool Device::imageSupport() const
1919 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1920
1921 bool Device::imageFromBufferSupport() const
1922 {
1923     bool ret = false;
1924     if (p)
1925     {
1926         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
1927         if (pos != String::npos)
1928         {
1929             ret = true;
1930         }
1931     }
1932     return ret;
1933 }
1934
1935 uint Device::imagePitchAlignment() const
1936 {
1937 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1938     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1939 #else
1940     return 0;
1941 #endif
1942 }
1943
1944 uint Device::imageBaseAddressAlignment() const
1945 {
1946 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1947     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1948 #else
1949     return 0;
1950 #endif
1951 }
1952
1953 size_t Device::image2DMaxWidth() const
1954 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1955
1956 size_t Device::image2DMaxHeight() const
1957 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1958
1959 size_t Device::image3DMaxWidth() const
1960 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1961
1962 size_t Device::image3DMaxHeight() const
1963 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1964
1965 size_t Device::image3DMaxDepth() const
1966 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1967
1968 size_t Device::imageMaxBufferSize() const
1969 #ifdef CL_VERSION_1_2
1970 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1971 #else
1972 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1973 #endif
1974
1975 size_t Device::imageMaxArraySize() const
1976 #ifdef CL_VERSION_1_2
1977 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1978 #else
1979 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1980 #endif
1981
1982 int Device::maxClockFrequency() const
1983 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1984
1985 int Device::maxComputeUnits() const
1986 { return p ? p->maxComputeUnits_ : 0; }
1987
1988 int Device::maxConstantArgs() const
1989 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1990
1991 size_t Device::maxConstantBufferSize() const
1992 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
1993
1994 size_t Device::maxMemAllocSize() const
1995 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
1996
1997 size_t Device::maxParameterSize() const
1998 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
1999
2000 int Device::maxReadImageArgs() const
2001 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2002
2003 int Device::maxWriteImageArgs() const
2004 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2005
2006 int Device::maxSamplers() const
2007 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2008
2009 size_t Device::maxWorkGroupSize() const
2010 { return p ? p->maxWorkGroupSize_ : 0; }
2011
2012 int Device::maxWorkItemDims() const
2013 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2014
2015 void Device::maxWorkItemSizes(size_t* sizes) const
2016 {
2017     if(p)
2018     {
2019         const int MAX_DIMS = 32;
2020         size_t retsz = 0;
2021         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2022                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2023     }
2024 }
2025
2026 int Device::memBaseAddrAlign() const
2027 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2028
2029 int Device::nativeVectorWidthChar() const
2030 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2031
2032 int Device::nativeVectorWidthShort() const
2033 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2034
2035 int Device::nativeVectorWidthInt() const
2036 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2037
2038 int Device::nativeVectorWidthLong() const
2039 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2040
2041 int Device::nativeVectorWidthFloat() const
2042 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2043
2044 int Device::nativeVectorWidthDouble() const
2045 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2046
2047 int Device::nativeVectorWidthHalf() const
2048 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2049
2050 int Device::preferredVectorWidthChar() const
2051 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2052
2053 int Device::preferredVectorWidthShort() const
2054 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2055
2056 int Device::preferredVectorWidthInt() const
2057 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2058
2059 int Device::preferredVectorWidthLong() const
2060 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2061
2062 int Device::preferredVectorWidthFloat() const
2063 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2064
2065 int Device::preferredVectorWidthDouble() const
2066 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2067
2068 int Device::preferredVectorWidthHalf() const
2069 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2070
2071 size_t Device::printfBufferSize() const
2072 #ifdef CL_VERSION_1_2
2073 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2074 #else
2075 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2076 #endif
2077
2078
2079 size_t Device::profilingTimerResolution() const
2080 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2081
2082 const Device& Device::getDefault()
2083 {
2084     const Context& ctx = Context::getDefault();
2085     int idx = coreTlsData.get()->device;
2086     return ctx.device(idx);
2087 }
2088
2089 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2090
2091 template <typename Functor, typename ObjectType>
2092 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2093 {
2094     ::size_t required;
2095     cl_int err = f(obj, name, 0, NULL, &required);
2096     if (err != CL_SUCCESS)
2097         return err;
2098
2099     param.clear();
2100     if (required > 0)
2101     {
2102         AutoBuffer<char> buf(required + 1);
2103         char* ptr = (char*)buf; // cleanup is not needed
2104         err = f(obj, name, required, ptr, NULL);
2105         if (err != CL_SUCCESS)
2106             return err;
2107         param = ptr;
2108     }
2109
2110     return CL_SUCCESS;
2111 }
2112
2113 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2114 {
2115     elems.clear();
2116     if (s.size() == 0)
2117         return;
2118     std::istringstream ss(s);
2119     std::string item;
2120     while (!ss.eof())
2121     {
2122         std::getline(ss, item, delim);
2123         elems.push_back(item);
2124     }
2125 }
2126
2127 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2128 // Sample: AMD:GPU:
2129 // Sample: AMD:GPU:Tahiti
2130 // Sample: :GPU|CPU: = '' = ':' = '::'
2131 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2132         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2133 {
2134     std::vector<std::string> parts;
2135     split(configurationStr, ':', parts);
2136     if (parts.size() > 3)
2137     {
2138         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2139         return false;
2140     }
2141     if (parts.size() > 2)
2142         deviceNameOrID = parts[2];
2143     if (parts.size() > 1)
2144     {
2145         split(parts[1], '|', deviceTypes);
2146     }
2147     if (parts.size() > 0)
2148     {
2149         platform = parts[0];
2150     }
2151     return true;
2152 }
2153
2154 #ifdef HAVE_WINRT
2155 static cl_device_id selectOpenCLDevice()
2156 {
2157     return NULL;
2158 }
2159 #else
2160 static cl_device_id selectOpenCLDevice()
2161 {
2162     std::string platform, deviceName;
2163     std::vector<std::string> deviceTypes;
2164
2165     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2166     if (configuration && !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
2167         return NULL;
2168
2169     bool isID = false;
2170     int deviceID = -1;
2171     if (deviceName.length() == 1)
2172     // We limit ID range to 0..9, because we want to write:
2173     // - '2500' to mean i5-2500
2174     // - '8350' to mean AMD FX-8350
2175     // - '650' to mean GeForce 650
2176     // To extend ID range change condition to '> 0'
2177     {
2178         isID = true;
2179         for (size_t i = 0; i < deviceName.length(); i++)
2180         {
2181             if (!isdigit(deviceName[i]))
2182             {
2183                 isID = false;
2184                 break;
2185             }
2186         }
2187         if (isID)
2188         {
2189             deviceID = atoi(deviceName.c_str());
2190             if (deviceID < 0)
2191                 return NULL;
2192         }
2193     }
2194
2195     std::vector<cl_platform_id> platforms;
2196     {
2197         cl_uint numPlatforms = 0;
2198         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2199
2200         if (numPlatforms == 0)
2201             return NULL;
2202         platforms.resize((size_t)numPlatforms);
2203         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2204         platforms.resize(numPlatforms);
2205     }
2206
2207     int selectedPlatform = -1;
2208     if (platform.length() > 0)
2209     {
2210         for (size_t i = 0; i < platforms.size(); i++)
2211         {
2212             std::string name;
2213             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2214             if (name.find(platform) != std::string::npos)
2215             {
2216                 selectedPlatform = (int)i;
2217                 break;
2218             }
2219         }
2220         if (selectedPlatform == -1)
2221         {
2222             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2223             goto not_found;
2224         }
2225     }
2226     if (deviceTypes.size() == 0)
2227     {
2228         if (!isID)
2229         {
2230             deviceTypes.push_back("GPU");
2231             deviceTypes.push_back("CPU");
2232         }
2233         else
2234             deviceTypes.push_back("ALL");
2235     }
2236     for (size_t t = 0; t < deviceTypes.size(); t++)
2237     {
2238         int deviceType = 0;
2239         std::string tempStrDeviceType = deviceTypes[t];
2240         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2241
2242         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2243             deviceType = Device::TYPE_GPU;
2244         else if (tempStrDeviceType == "cpu")
2245             deviceType = Device::TYPE_CPU;
2246         else if (tempStrDeviceType == "accelerator")
2247             deviceType = Device::TYPE_ACCELERATOR;
2248         else if (tempStrDeviceType == "all")
2249             deviceType = Device::TYPE_ALL;
2250         else
2251         {
2252             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2253             goto not_found;
2254         }
2255
2256         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2257         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2258                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2259                 i++)
2260         {
2261             cl_uint count = 0;
2262             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2263             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2264             if (count == 0)
2265                 continue;
2266             size_t base = devices.size();
2267             devices.resize(base + count);
2268             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2269             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2270         }
2271
2272         for (size_t i = (isID ? deviceID : 0);
2273              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2274              i++)
2275         {
2276             std::string name;
2277             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2278             cl_bool useGPU = true;
2279             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2280             {
2281                 cl_bool isIGPU = CL_FALSE;
2282                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2283                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2284             }
2285             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2286             {
2287                 // TODO check for OpenCL 1.1
2288                 return devices[i];
2289             }
2290         }
2291     }
2292
2293 not_found:
2294     std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2295             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2296             << "    Device types: ";
2297     for (size_t t = 0; t < deviceTypes.size(); t++)
2298         std::cerr << deviceTypes[t] << " ";
2299
2300     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2301     CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
2302     return NULL;
2303 }
2304 #endif
2305
2306 struct Context::Impl
2307 {
2308     Impl()
2309     {
2310         refcount = 1;
2311         handle = 0;
2312     }
2313
2314     void setDefault()
2315     {
2316         CV_Assert(handle == NULL);
2317
2318         cl_device_id d = selectOpenCLDevice();
2319
2320         if (d == NULL)
2321             return;
2322
2323         cl_platform_id pl = NULL;
2324         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2325
2326         cl_context_properties prop[] =
2327         {
2328             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2329             0
2330         };
2331
2332         // !!! in the current implementation force the number of devices to 1 !!!
2333         cl_uint nd = 1;
2334         cl_int status;
2335
2336         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2337
2338         bool ok = handle != 0 && status == CL_SUCCESS;
2339         if( ok )
2340         {
2341             devices.resize(nd);
2342             devices[0].set(d);
2343         }
2344         else
2345             handle = NULL;
2346     }
2347
2348     Impl(int dtype0)
2349     {
2350         refcount = 1;
2351         handle = 0;
2352
2353         cl_int retval = 0;
2354         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2355         cl_context_properties prop[] =
2356         {
2357             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2358             0
2359         };
2360
2361         cl_uint i, nd0 = 0, nd = 0;
2362         int dtype = dtype0 & 15;
2363         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2364
2365         AutoBuffer<void*> dlistbuf(nd0*2+1);
2366         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2367         cl_device_id* dlist_new = dlist + nd0;
2368         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2369         String name0;
2370
2371         for(i = 0; i < nd0; i++)
2372         {
2373             Device d(dlist[i]);
2374             if( !d.available() || !d.compilerAvailable() )
2375                 continue;
2376             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2377                 continue;
2378             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2379                 continue;
2380             String name = d.name();
2381             if( nd != 0 && name != name0 )
2382                 continue;
2383             name0 = name;
2384             dlist_new[nd++] = dlist[i];
2385         }
2386
2387         if(nd == 0)
2388             return;
2389
2390         // !!! in the current implementation force the number of devices to 1 !!!
2391         nd = 1;
2392
2393         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2394         bool ok = handle != 0 && retval == CL_SUCCESS;
2395         if( ok )
2396         {
2397             devices.resize(nd);
2398             for( i = 0; i < nd; i++ )
2399                 devices[i].set(dlist_new[i]);
2400         }
2401     }
2402
2403     ~Impl()
2404     {
2405         if(handle)
2406         {
2407             clReleaseContext(handle);
2408             handle = NULL;
2409         }
2410         devices.clear();
2411     }
2412
2413     Program getProg(const ProgramSource& src,
2414                     const String& buildflags, String& errmsg)
2415     {
2416         String prefix = Program::getPrefix(buildflags);
2417         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2418         phash_t::iterator it = phash.find(k);
2419         if( it != phash.end() )
2420             return it->second;
2421         //String filename = format("%08x%08x_%08x%08x.clb2",
2422         Program prog(src, buildflags, errmsg);
2423         if(prog.ptr())
2424             phash.insert(std::pair<HashKey,Program>(k, prog));
2425         return prog;
2426     }
2427
2428     IMPLEMENT_REFCOUNTABLE();
2429
2430     cl_context handle;
2431     std::vector<Device> devices;
2432
2433     typedef ProgramSource::hash_t hash_t;
2434
2435     struct HashKey
2436     {
2437         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2438         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2439         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2440         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2441         hash_t a, b;
2442     };
2443     typedef std::map<HashKey, Program> phash_t;
2444     phash_t phash;
2445 };
2446
2447
2448 Context::Context()
2449 {
2450     p = 0;
2451 }
2452
2453 Context::Context(int dtype)
2454 {
2455     p = 0;
2456     create(dtype);
2457 }
2458
2459 bool Context::create()
2460 {
2461     if( !haveOpenCL() )
2462         return false;
2463     if(p)
2464         p->release();
2465     p = new Impl();
2466     if(!p->handle)
2467     {
2468         delete p;
2469         p = 0;
2470     }
2471     return p != 0;
2472 }
2473
2474 bool Context::create(int dtype0)
2475 {
2476     if( !haveOpenCL() )
2477         return false;
2478     if(p)
2479         p->release();
2480     p = new Impl(dtype0);
2481     if(!p->handle)
2482     {
2483         delete p;
2484         p = 0;
2485     }
2486     return p != 0;
2487 }
2488
2489 Context::~Context()
2490 {
2491     if (p)
2492     {
2493         p->release();
2494         p = NULL;
2495     }
2496 }
2497
2498 Context::Context(const Context& c)
2499 {
2500     p = (Impl*)c.p;
2501     if(p)
2502         p->addref();
2503 }
2504
2505 Context& Context::operator = (const Context& c)
2506 {
2507     Impl* newp = (Impl*)c.p;
2508     if(newp)
2509         newp->addref();
2510     if(p)
2511         p->release();
2512     p = newp;
2513     return *this;
2514 }
2515
2516 void* Context::ptr() const
2517 {
2518     return p == NULL ? NULL : p->handle;
2519 }
2520
2521 size_t Context::ndevices() const
2522 {
2523     return p ? p->devices.size() : 0;
2524 }
2525
2526 const Device& Context::device(size_t idx) const
2527 {
2528     static Device dummy;
2529     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2530 }
2531
2532 Context& Context::getDefault(bool initialize)
2533 {
2534     static Context* ctx = new Context();
2535     if(!ctx->p && haveOpenCL())
2536     {
2537         if (!ctx->p)
2538             ctx->p = new Impl();
2539         if (initialize)
2540         {
2541             // do not create new Context right away.
2542             // First, try to retrieve existing context of the same type.
2543             // In its turn, Platform::getContext() may call Context::create()
2544             // if there is no such context.
2545             if (ctx->p->handle == NULL)
2546                 ctx->p->setDefault();
2547         }
2548     }
2549
2550     return *ctx;
2551 }
2552
2553 Program Context::getProg(const ProgramSource& prog,
2554                          const String& buildopts, String& errmsg)
2555 {
2556     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2557 }
2558
2559 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2560 {
2561     cl_context context = (cl_context)_context;
2562     cl_device_id device = (cl_device_id)_device;
2563
2564     // cleanup old context
2565     Context::Impl * impl = ctx.p;
2566     if (impl->handle)
2567     {
2568         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2569     }
2570     impl->devices.clear();
2571
2572     impl->handle = context;
2573     impl->devices.resize(1);
2574     impl->devices[0].set(device);
2575
2576     Platform& p = Platform::getDefault();
2577     Platform::Impl* pImpl = p.p;
2578     pImpl->handle = (cl_platform_id)platform;
2579 }
2580
2581 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2582
2583 struct Queue::Impl
2584 {
2585     Impl(const Context& c, const Device& d)
2586     {
2587         refcount = 1;
2588         const Context* pc = &c;
2589         cl_context ch = (cl_context)pc->ptr();
2590         if( !ch )
2591         {
2592             pc = &Context::getDefault();
2593             ch = (cl_context)pc->ptr();
2594         }
2595         cl_device_id dh = (cl_device_id)d.ptr();
2596         if( !dh )
2597             dh = (cl_device_id)pc->device(0).ptr();
2598         cl_int retval = 0;
2599         handle = clCreateCommandQueue(ch, dh, 0, &retval);
2600         CV_OclDbgAssert(retval == CL_SUCCESS);
2601     }
2602
2603     ~Impl()
2604     {
2605 #ifdef _WIN32
2606         if (!cv::__termination)
2607 #endif
2608         {
2609             if(handle)
2610             {
2611                 clFinish(handle);
2612                 clReleaseCommandQueue(handle);
2613                 handle = NULL;
2614             }
2615         }
2616     }
2617
2618     IMPLEMENT_REFCOUNTABLE();
2619
2620     cl_command_queue handle;
2621 };
2622
2623 Queue::Queue()
2624 {
2625     p = 0;
2626 }
2627
2628 Queue::Queue(const Context& c, const Device& d)
2629 {
2630     p = 0;
2631     create(c, d);
2632 }
2633
2634 Queue::Queue(const Queue& q)
2635 {
2636     p = q.p;
2637     if(p)
2638         p->addref();
2639 }
2640
2641 Queue& Queue::operator = (const Queue& q)
2642 {
2643     Impl* newp = (Impl*)q.p;
2644     if(newp)
2645         newp->addref();
2646     if(p)
2647         p->release();
2648     p = newp;
2649     return *this;
2650 }
2651
2652 Queue::~Queue()
2653 {
2654     if(p)
2655         p->release();
2656 }
2657
2658 bool Queue::create(const Context& c, const Device& d)
2659 {
2660     if(p)
2661         p->release();
2662     p = new Impl(c, d);
2663     return p->handle != 0;
2664 }
2665
2666 void Queue::finish()
2667 {
2668     if(p && p->handle)
2669     {
2670         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
2671     }
2672 }
2673
2674 void* Queue::ptr() const
2675 {
2676     return p ? p->handle : 0;
2677 }
2678
2679 Queue& Queue::getDefault()
2680 {
2681     Queue& q = coreTlsData.get()->oclQueue;
2682     if( !q.p && haveOpenCL() )
2683         q.create(Context::getDefault());
2684     return q;
2685 }
2686
2687 static cl_command_queue getQueue(const Queue& q)
2688 {
2689     cl_command_queue qq = (cl_command_queue)q.ptr();
2690     if(!qq)
2691         qq = (cl_command_queue)Queue::getDefault().ptr();
2692     return qq;
2693 }
2694
2695 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2696
2697 KernelArg::KernelArg()
2698     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2699 {
2700 }
2701
2702 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2703     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2704 {
2705 }
2706
2707 KernelArg KernelArg::Constant(const Mat& m)
2708 {
2709     CV_Assert(m.isContinuous());
2710     return KernelArg(CONSTANT, 0, 0, 0, m.data, m.total()*m.elemSize());
2711 }
2712
2713 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2714
2715 struct Kernel::Impl
2716 {
2717     Impl(const char* kname, const Program& prog) :
2718         refcount(1), e(0), nu(0)
2719     {
2720         cl_program ph = (cl_program)prog.ptr();
2721         cl_int retval = 0;
2722         handle = ph != 0 ?
2723             clCreateKernel(ph, kname, &retval) : 0;
2724         CV_OclDbgAssert(retval == CL_SUCCESS);
2725         for( int i = 0; i < MAX_ARRS; i++ )
2726             u[i] = 0;
2727         haveTempDstUMats = false;
2728     }
2729
2730     void cleanupUMats()
2731     {
2732         for( int i = 0; i < MAX_ARRS; i++ )
2733             if( u[i] )
2734             {
2735                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2736                     u[i]->currAllocator->deallocate(u[i]);
2737                 u[i] = 0;
2738             }
2739         nu = 0;
2740         haveTempDstUMats = false;
2741     }
2742
2743     void addUMat(const UMat& m, bool dst)
2744     {
2745         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2746         u[nu] = m.u;
2747         CV_XADD(&m.u->urefcount, 1);
2748         nu++;
2749         if(dst && m.u->tempUMat())
2750             haveTempDstUMats = true;
2751     }
2752
2753     void addImage(const Image2D& image)
2754     {
2755         images.push_back(image);
2756     }
2757
2758     void finit()
2759     {
2760         cleanupUMats();
2761         images.clear();
2762         if(e) { clReleaseEvent(e); e = 0; }
2763         release();
2764     }
2765
2766     ~Impl()
2767     {
2768         if(handle)
2769             clReleaseKernel(handle);
2770     }
2771
2772     IMPLEMENT_REFCOUNTABLE();
2773
2774     cl_kernel handle;
2775     cl_event e;
2776     enum { MAX_ARRS = 16 };
2777     UMatData* u[MAX_ARRS];
2778     int nu;
2779     std::list<Image2D> images;
2780     bool haveTempDstUMats;
2781 };
2782
2783 }}
2784
2785 extern "C"
2786 {
2787 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
2788 {
2789     ((cv::ocl::Kernel::Impl*)p)->finit();
2790 }
2791
2792 }
2793
2794 namespace cv { namespace ocl {
2795
2796 Kernel::Kernel()
2797 {
2798     p = 0;
2799 }
2800
2801 Kernel::Kernel(const char* kname, const Program& prog)
2802 {
2803     p = 0;
2804     create(kname, prog);
2805 }
2806
2807 Kernel::Kernel(const char* kname, const ProgramSource& src,
2808                const String& buildopts, String* errmsg)
2809 {
2810     p = 0;
2811     create(kname, src, buildopts, errmsg);
2812 }
2813
2814 Kernel::Kernel(const Kernel& k)
2815 {
2816     p = k.p;
2817     if(p)
2818         p->addref();
2819 }
2820
2821 Kernel& Kernel::operator = (const Kernel& k)
2822 {
2823     Impl* newp = (Impl*)k.p;
2824     if(newp)
2825         newp->addref();
2826     if(p)
2827         p->release();
2828     p = newp;
2829     return *this;
2830 }
2831
2832 Kernel::~Kernel()
2833 {
2834     if(p)
2835         p->release();
2836 }
2837
2838 bool Kernel::create(const char* kname, const Program& prog)
2839 {
2840     if(p)
2841         p->release();
2842     p = new Impl(kname, prog);
2843     if(p->handle == 0)
2844     {
2845         p->release();
2846         p = 0;
2847     }
2848     return p != 0;
2849 }
2850
2851 bool Kernel::create(const char* kname, const ProgramSource& src,
2852                     const String& buildopts, String* errmsg)
2853 {
2854     if(p)
2855     {
2856         p->release();
2857         p = 0;
2858     }
2859     String tempmsg;
2860     if( !errmsg ) errmsg = &tempmsg;
2861     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2862     return create(kname, prog);
2863 }
2864
2865 void* Kernel::ptr() const
2866 {
2867     return p ? p->handle : 0;
2868 }
2869
2870 bool Kernel::empty() const
2871 {
2872     return ptr() == 0;
2873 }
2874
2875 int Kernel::set(int i, const void* value, size_t sz)
2876 {
2877     if (!p || !p->handle)
2878         return -1;
2879     if (i < 0)
2880         return i;
2881     if( i == 0 )
2882         p->cleanupUMats();
2883
2884     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2885     CV_OclDbgAssert(retval == CL_SUCCESS);
2886     if (retval != CL_SUCCESS)
2887         return -1;
2888     return i+1;
2889 }
2890
2891 int Kernel::set(int i, const Image2D& image2D)
2892 {
2893     p->addImage(image2D);
2894     cl_mem h = (cl_mem)image2D.ptr();
2895     return set(i, &h, sizeof(h));
2896 }
2897
2898 int Kernel::set(int i, const UMat& m)
2899 {
2900     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
2901 }
2902
2903 int Kernel::set(int i, const KernelArg& arg)
2904 {
2905     if( !p || !p->handle )
2906         return -1;
2907     if (i < 0)
2908         return i;
2909     if( i == 0 )
2910         p->cleanupUMats();
2911     if( arg.m )
2912     {
2913         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2914                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2915         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2916         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
2917
2918         if (!h)
2919         {
2920             p->release();
2921             p = 0;
2922             return -1;
2923         }
2924
2925         if (ptronly)
2926             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
2927         else if( arg.m->dims <= 2 )
2928         {
2929             UMat2D u2d(*arg.m);
2930             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2931             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
2932             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
2933             i += 3;
2934
2935             if( !(arg.flags & KernelArg::NO_SIZE) )
2936             {
2937                 int cols = u2d.cols*arg.wscale/arg.iwscale;
2938                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
2939                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
2940                 i += 2;
2941             }
2942         }
2943         else
2944         {
2945             UMat3D u3d(*arg.m);
2946             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2947             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
2948             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
2949             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
2950             i += 4;
2951             if( !(arg.flags & KernelArg::NO_SIZE) )
2952             {
2953                 int cols = u3d.cols*arg.wscale/arg.iwscale;
2954                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
2955                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
2956                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
2957                 i += 3;
2958             }
2959         }
2960         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
2961         return i;
2962     }
2963     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
2964     return i+1;
2965 }
2966
2967
2968 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
2969                  bool sync, const Queue& q)
2970 {
2971     if(!p || !p->handle || p->e != 0)
2972         return false;
2973
2974     cl_command_queue qq = getQueue(q);
2975     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
2976     size_t total = 1;
2977     CV_Assert(_globalsize != 0);
2978     for (int i = 0; i < dims; i++)
2979     {
2980         size_t val = _localsize ? _localsize[i] :
2981             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
2982         CV_Assert( val > 0 );
2983         total *= _globalsize[i];
2984         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
2985     }
2986     if( total == 0 )
2987         return true;
2988     if( p->haveTempDstUMats )
2989         sync = true;
2990     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
2991                                            offset, globalsize, _localsize, 0, 0,
2992                                            sync ? 0 : &p->e);
2993     if( sync || retval != CL_SUCCESS )
2994     {
2995         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
2996         p->cleanupUMats();
2997     }
2998     else
2999     {
3000         p->addref();
3001         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3002     }
3003     return retval == CL_SUCCESS;
3004 }
3005
3006 bool Kernel::runTask(bool sync, const Queue& q)
3007 {
3008     if(!p || !p->handle || p->e != 0)
3009         return false;
3010
3011     cl_command_queue qq = getQueue(q);
3012     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3013     if( sync || retval != CL_SUCCESS )
3014     {
3015         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3016         p->cleanupUMats();
3017     }
3018     else
3019     {
3020         p->addref();
3021         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3022     }
3023     return retval == CL_SUCCESS;
3024 }
3025
3026
3027 size_t Kernel::workGroupSize() const
3028 {
3029     if(!p || !p->handle)
3030         return 0;
3031     size_t val = 0, retsz = 0;
3032     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3033     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3034                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3035 }
3036
3037 size_t Kernel::preferedWorkGroupSizeMultiple() const
3038 {
3039     if(!p || !p->handle)
3040         return 0;
3041     size_t val = 0, retsz = 0;
3042     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3043     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3044                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3045 }
3046
3047 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3048 {
3049     if(!p || !p->handle || !wsz)
3050         return 0;
3051     size_t retsz = 0;
3052     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3053     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3054                                     sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
3055 }
3056
3057 size_t Kernel::localMemSize() const
3058 {
3059     if(!p || !p->handle)
3060         return 0;
3061     size_t retsz = 0;
3062     cl_ulong val = 0;
3063     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3064     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3065                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3066 }
3067
3068 /////////////////////////////////////////// Program /////////////////////////////////////////////
3069
3070 struct Program::Impl
3071 {
3072     Impl(const ProgramSource& _src,
3073          const String& _buildflags, String& errmsg)
3074     {
3075         refcount = 1;
3076         const Context& ctx = Context::getDefault();
3077         src = _src;
3078         buildflags = _buildflags;
3079         const String& srcstr = src.source();
3080         const char* srcptr = srcstr.c_str();
3081         size_t srclen = srcstr.size();
3082         cl_int retval = 0;
3083
3084         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3085         if( handle && retval == CL_SUCCESS )
3086         {
3087             int i, n = (int)ctx.ndevices();
3088             AutoBuffer<void*> deviceListBuf(n+1);
3089             void** deviceList = deviceListBuf;
3090             for( i = 0; i < n; i++ )
3091                 deviceList[i] = ctx.device(i).ptr();
3092
3093             Device device = Device::getDefault();
3094             if (device.isAMD())
3095                 buildflags += " -D AMD_DEVICE";
3096             else if (device.isIntel())
3097                 buildflags += " -D INTEL_DEVICE";
3098
3099             retval = clBuildProgram(handle, n,
3100                                     (const cl_device_id*)deviceList,
3101                                     buildflags.c_str(), 0, 0);
3102 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3103             if( retval != CL_SUCCESS )
3104 #endif
3105             {
3106                 size_t retsz = 0;
3107                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3108                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3109                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3110                 {
3111                     AutoBuffer<char> bufbuf(retsz + 16);
3112                     char* buf = bufbuf;
3113                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3114                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3115                     if (buildInfo_retval == CL_SUCCESS)
3116                     {
3117                         // TODO It is useful to see kernel name & program file name also
3118                         errmsg = String(buf);
3119                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3120                         fflush(stdout);
3121                     }
3122                 }
3123                 if (retval != CL_SUCCESS && handle)
3124                 {
3125                     clReleaseProgram(handle);
3126                     handle = NULL;
3127                 }
3128             }
3129         }
3130     }
3131
3132     Impl(const String& _buf, const String& _buildflags)
3133     {
3134         refcount = 1;
3135         handle = 0;
3136         buildflags = _buildflags;
3137         if(_buf.empty())
3138             return;
3139         String prefix0 = Program::getPrefix(buildflags);
3140         const Context& ctx = Context::getDefault();
3141         const Device& dev = Device::getDefault();
3142         const char* pos0 = _buf.c_str();
3143         const char* pos1 = strchr(pos0, '\n');
3144         if(!pos1)
3145             return;
3146         const char* pos2 = strchr(pos1+1, '\n');
3147         if(!pos2)
3148             return;
3149         const char* pos3 = strchr(pos2+1, '\n');
3150         if(!pos3)
3151             return;
3152         size_t prefixlen = (pos3 - pos0)+1;
3153         String prefix(pos0, prefixlen);
3154         if( prefix != prefix0 )
3155             return;
3156         const uchar* bin = (uchar*)(pos3+1);
3157         void* devid = dev.ptr();
3158         size_t codelen = _buf.length() - prefixlen;
3159         cl_int binstatus = 0, retval = 0;
3160         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3161                                            &codelen, &bin, &binstatus, &retval);
3162         CV_OclDbgAssert(retval == CL_SUCCESS);
3163     }
3164
3165     String store()
3166     {
3167         if(!handle)
3168             return String();
3169         size_t progsz = 0, retsz = 0;
3170         String prefix = Program::getPrefix(buildflags);
3171         size_t prefixlen = prefix.length();
3172         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3173             return String();
3174         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3175         uchar* buf = bufbuf;
3176         memcpy(buf, prefix.c_str(), prefixlen);
3177         buf += prefixlen;
3178         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3179             return String();
3180         buf[progsz] = (uchar)'\0';
3181         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3182     }
3183
3184     ~Impl()
3185     {
3186         if( handle )
3187         {
3188 #ifdef _WIN32
3189             if (!cv::__termination)
3190 #endif
3191             {
3192                 clReleaseProgram(handle);
3193             }
3194             handle = NULL;
3195         }
3196     }
3197
3198     IMPLEMENT_REFCOUNTABLE();
3199
3200     ProgramSource src;
3201     String buildflags;
3202     cl_program handle;
3203 };
3204
3205
3206 Program::Program() { p = 0; }
3207
3208 Program::Program(const ProgramSource& src,
3209         const String& buildflags, String& errmsg)
3210 {
3211     p = 0;
3212     create(src, buildflags, errmsg);
3213 }
3214
3215 Program::Program(const Program& prog)
3216 {
3217     p = prog.p;
3218     if(p)
3219         p->addref();
3220 }
3221
3222 Program& Program::operator = (const Program& prog)
3223 {
3224     Impl* newp = (Impl*)prog.p;
3225     if(newp)
3226         newp->addref();
3227     if(p)
3228         p->release();
3229     p = newp;
3230     return *this;
3231 }
3232
3233 Program::~Program()
3234 {
3235     if(p)
3236         p->release();
3237 }
3238
3239 bool Program::create(const ProgramSource& src,
3240             const String& buildflags, String& errmsg)
3241 {
3242     if(p)
3243         p->release();
3244     p = new Impl(src, buildflags, errmsg);
3245     if(!p->handle)
3246     {
3247         p->release();
3248         p = 0;
3249     }
3250     return p != 0;
3251 }
3252
3253 const ProgramSource& Program::source() const
3254 {
3255     static ProgramSource dummy;
3256     return p ? p->src : dummy;
3257 }
3258
3259 void* Program::ptr() const
3260 {
3261     return p ? p->handle : 0;
3262 }
3263
3264 bool Program::read(const String& bin, const String& buildflags)
3265 {
3266     if(p)
3267         p->release();
3268     p = new Impl(bin, buildflags);
3269     return p->handle != 0;
3270 }
3271
3272 bool Program::write(String& bin) const
3273 {
3274     if(!p)
3275         return false;
3276     bin = p->store();
3277     return !bin.empty();
3278 }
3279
3280 String Program::getPrefix() const
3281 {
3282     if(!p)
3283         return String();
3284     return getPrefix(p->buildflags);
3285 }
3286
3287 String Program::getPrefix(const String& buildflags)
3288 {
3289     const Context& ctx = Context::getDefault();
3290     const Device& dev = ctx.device(0);
3291     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3292                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3293 }
3294
3295 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3296
3297 struct ProgramSource::Impl
3298 {
3299     Impl(const char* _src)
3300     {
3301         init(String(_src));
3302     }
3303     Impl(const String& _src)
3304     {
3305         init(_src);
3306     }
3307     void init(const String& _src)
3308     {
3309         refcount = 1;
3310         src = _src;
3311         h = crc64((uchar*)src.c_str(), src.size());
3312     }
3313
3314     IMPLEMENT_REFCOUNTABLE();
3315     String src;
3316     ProgramSource::hash_t h;
3317 };
3318
3319
3320 ProgramSource::ProgramSource()
3321 {
3322     p = 0;
3323 }
3324
3325 ProgramSource::ProgramSource(const char* prog)
3326 {
3327     p = new Impl(prog);
3328 }
3329
3330 ProgramSource::ProgramSource(const String& prog)
3331 {
3332     p = new Impl(prog);
3333 }
3334
3335 ProgramSource::~ProgramSource()
3336 {
3337     if(p)
3338         p->release();
3339 }
3340
3341 ProgramSource::ProgramSource(const ProgramSource& prog)
3342 {
3343     p = prog.p;
3344     if(p)
3345         p->addref();
3346 }
3347
3348 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3349 {
3350     Impl* newp = (Impl*)prog.p;
3351     if(newp)
3352         newp->addref();
3353     if(p)
3354         p->release();
3355     p = newp;
3356     return *this;
3357 }
3358
3359 const String& ProgramSource::source() const
3360 {
3361     static String dummy;
3362     return p ? p->src : dummy;
3363 }
3364
3365 ProgramSource::hash_t ProgramSource::hash() const
3366 {
3367     return p ? p->h : 0;
3368 }
3369
3370 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3371
3372 class OpenCLBufferPool
3373 {
3374 protected:
3375     ~OpenCLBufferPool() { }
3376 public:
3377     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
3378     virtual void release(cl_mem handle, size_t capacity) = 0;
3379 };
3380
3381 class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
3382 {
3383 public:
3384     struct BufferEntry
3385     {
3386         cl_mem clBuffer_;
3387         size_t capacity_;
3388     };
3389 protected:
3390     Mutex mutex_;
3391
3392     size_t currentReservedSize;
3393     size_t maxReservedSize;
3394
3395     std::list<BufferEntry> reservedEntries_; // LRU order
3396
3397     // synchronized
3398     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3399     {
3400         if (reservedEntries_.empty())
3401             return false;
3402         std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3403         std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3404         BufferEntry result = {NULL, 0};
3405         size_t minDiff = (size_t)(-1);
3406         for (; i != reservedEntries_.end(); ++i)
3407         {
3408             BufferEntry& e = *i;
3409             if (e.capacity_ >= size)
3410             {
3411                 size_t diff = e.capacity_ - size;
3412                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3413                 {
3414                     minDiff = diff;
3415                     result_pos = i;
3416                     result = e;
3417                     if (diff == 0)
3418                         break;
3419                 }
3420             }
3421         }
3422         if (result_pos != reservedEntries_.end())
3423         {
3424             //CV_DbgAssert(result == *result_pos);
3425             reservedEntries_.erase(result_pos);
3426             entry = result;
3427             currentReservedSize -= entry.capacity_;
3428             return true;
3429         }
3430         return false;
3431     }
3432
3433     // synchronized
3434     void _checkSizeOfReservedEntries()
3435     {
3436         while (currentReservedSize > maxReservedSize)
3437         {
3438             CV_DbgAssert(!reservedEntries_.empty());
3439             const BufferEntry& entry = reservedEntries_.back();
3440             CV_DbgAssert(currentReservedSize >= entry.capacity_);
3441             currentReservedSize -= entry.capacity_;
3442             _releaseBufferEntry(entry);
3443             reservedEntries_.pop_back();
3444         }
3445     }
3446
3447     inline size_t _allocationGranularity(size_t size)
3448     {
3449         // heuristic values
3450         if (size < 1024)
3451             return 16;
3452         else if (size < 64*1024)
3453             return 64;
3454         else if (size < 1024*1024)
3455             return 4096;
3456         else if (size < 16*1024*1024)
3457             return 64*1024;
3458         else
3459             return 1024*1024;
3460     }
3461
3462     void _allocateBufferEntry(BufferEntry& entry, size_t size)
3463     {
3464         CV_DbgAssert(entry.clBuffer_ == NULL);
3465         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3466         Context& ctx = Context::getDefault();
3467         cl_int retval = CL_SUCCESS;
3468         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
3469         CV_Assert(retval == CL_SUCCESS);
3470         CV_Assert(entry.clBuffer_ != NULL);
3471         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3472                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3473     }
3474
3475     void _releaseBufferEntry(const BufferEntry& entry)
3476     {
3477         CV_Assert(entry.capacity_ != 0);
3478         CV_Assert(entry.clBuffer_ != NULL);
3479         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
3480                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
3481         clReleaseMemObject(entry.clBuffer_);
3482     }
3483 public:
3484     OpenCLBufferPoolImpl()
3485         : currentReservedSize(0), maxReservedSize(0)
3486     {
3487         // Note: Buffer pool is disabled by default,
3488         //       because we didn't receive significant performance improvement
3489         maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", 0);
3490     }
3491     virtual ~OpenCLBufferPoolImpl()
3492     {
3493         freeAllReservedBuffers();
3494         CV_Assert(reservedEntries_.empty());
3495     }
3496 public:
3497     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
3498     {
3499         BufferEntry entry = {NULL, 0};
3500         if (maxReservedSize > 0)
3501         {
3502             AutoLock locker(mutex_);
3503             if (_findAndRemoveEntryFromReservedList(entry, size))
3504             {
3505                 CV_DbgAssert(size <= entry.capacity_);
3506                 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3507                 capacity = entry.capacity_;
3508                 return entry.clBuffer_;
3509             }
3510         }
3511         _allocateBufferEntry(entry, size);
3512         capacity = entry.capacity_;
3513         return entry.clBuffer_;
3514     }
3515     virtual void release(cl_mem handle, size_t capacity)
3516     {
3517         BufferEntry entry = {handle, capacity};
3518         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3519         {
3520             _releaseBufferEntry(entry);
3521         }
3522         else
3523         {
3524             AutoLock locker(mutex_);
3525             reservedEntries_.push_front(entry);
3526             currentReservedSize += entry.capacity_;
3527             _checkSizeOfReservedEntries();
3528         }
3529     }
3530
3531     virtual size_t getReservedSize() const { return currentReservedSize; }
3532     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3533     virtual void setMaxReservedSize(size_t size)
3534     {
3535         AutoLock locker(mutex_);
3536         size_t oldMaxReservedSize = maxReservedSize;
3537         maxReservedSize = size;
3538         if (maxReservedSize < oldMaxReservedSize)
3539         {
3540             std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3541             for (; i != reservedEntries_.end();)
3542             {
3543                 const BufferEntry& entry = *i;
3544                 if (entry.capacity_ > maxReservedSize / 8)
3545                 {
3546                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
3547                     currentReservedSize -= entry.capacity_;
3548                     _releaseBufferEntry(entry);
3549                     i = reservedEntries_.erase(i);
3550                     continue;
3551                 }
3552                 ++i;
3553             }
3554             _checkSizeOfReservedEntries();
3555         }
3556     }
3557     virtual void freeAllReservedBuffers()
3558     {
3559         AutoLock locker(mutex_);
3560         std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3561         for (; i != reservedEntries_.end(); ++i)
3562         {
3563             const BufferEntry& entry = *i;
3564             _releaseBufferEntry(entry);
3565         }
3566         reservedEntries_.clear();
3567     }
3568 };
3569
3570 #if defined _MSC_VER
3571 #pragma warning(disable:4127) // conditional expression is constant
3572 #endif
3573 template <bool readAccess, bool writeAccess>
3574 class AlignedDataPtr
3575 {
3576 protected:
3577     const size_t size_;
3578     uchar* const originPtr_;
3579     const size_t alignment_;
3580     uchar* ptr_;
3581     uchar* allocatedPtr_;
3582
3583 public:
3584     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
3585         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
3586     {
3587         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
3588         if (((size_t)ptr_ & (alignment - 1)) != 0)
3589         {
3590             allocatedPtr_ = new uchar[size_ + alignment - 1];
3591             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
3592             if (readAccess)
3593             {
3594                 memcpy(ptr_, originPtr_, size_);
3595             }
3596         }
3597     }
3598
3599     uchar* getAlignedPtr() const
3600     {
3601         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
3602         return ptr_;
3603     }
3604
3605     ~AlignedDataPtr()
3606     {
3607         if (allocatedPtr_)
3608         {
3609             if (writeAccess)
3610             {
3611                 memcpy(originPtr_, ptr_, size_);
3612             }
3613             delete[] allocatedPtr_;
3614             allocatedPtr_ = NULL;
3615         }
3616         ptr_ = NULL;
3617     }
3618 private:
3619     AlignedDataPtr(const AlignedDataPtr&); // disabled
3620     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
3621 };
3622 #if defined _MSC_VER
3623 #pragma warning(default:4127) // conditional expression is constant
3624 #endif
3625
3626 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
3627 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
3628 #endif
3629
3630 class OpenCLAllocator : public MatAllocator
3631 {
3632     mutable OpenCLBufferPoolImpl bufferPool;
3633     enum AllocatorFlags
3634     {
3635         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
3636     };
3637 public:
3638     OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
3639
3640     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
3641             int flags, UMatUsageFlags usageFlags) const
3642     {
3643         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
3644         return u;
3645     }
3646
3647     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
3648     {
3649         const Device& dev = ctx.device(0);
3650         createFlags = 0;
3651         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
3652             createFlags |= CL_MEM_ALLOC_HOST_PTR;
3653
3654         if( dev.hostUnifiedMemory() )
3655             flags0 = 0;
3656         else
3657             flags0 = UMatData::COPY_ON_MAP;
3658     }
3659
3660     UMatData* allocate(int dims, const int* sizes, int type,
3661                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
3662     {
3663         if(!useOpenCL())
3664             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3665         CV_Assert(data == 0);
3666         size_t total = CV_ELEM_SIZE(type);
3667         for( int i = dims-1; i >= 0; i-- )
3668         {
3669             if( step )
3670                 step[i] = total;
3671             total *= sizes[i];
3672         }
3673
3674         Context& ctx = Context::getDefault();
3675         int createFlags = 0, flags0 = 0;
3676         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
3677
3678         size_t capacity = 0;
3679         void* handle = NULL;
3680         int allocatorFlags = 0;
3681         if (createFlags == 0)
3682         {
3683             handle = bufferPool.allocate(total, capacity);
3684             if (!handle)
3685                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3686             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
3687         }
3688         else
3689         {
3690             capacity = total;
3691             cl_int retval = 0;
3692             handle = clCreateBuffer((cl_context)ctx.ptr(),
3693                                           CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
3694             if( !handle || retval != CL_SUCCESS )
3695                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3696         }
3697         UMatData* u = new UMatData(this);
3698         u->data = 0;
3699         u->size = total;
3700         u->capacity = capacity;
3701         u->handle = handle;
3702         u->flags = flags0;
3703         u->allocatorFlags_ = allocatorFlags;
3704         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
3705         return u;
3706     }
3707
3708     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
3709     {
3710         if(!u)
3711             return false;
3712
3713         UMatDataAutoLock lock(u);
3714
3715         if(u->handle == 0)
3716         {
3717             CV_Assert(u->origdata != 0);
3718             Context& ctx = Context::getDefault();
3719             int createFlags = 0, flags0 = 0;
3720             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
3721
3722             cl_context ctx_handle = (cl_context)ctx.ptr();
3723             cl_int retval = 0;
3724             int tempUMatFlags = UMatData::TEMP_UMAT;
3725             u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
3726                                        u->size, u->origdata, &retval);
3727             if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
3728             {
3729                 u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
3730                                            u->size, u->origdata, &retval);
3731                 tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
3732             }
3733             if(!u->handle || retval != CL_SUCCESS)
3734                 return false;
3735             u->prevAllocator = u->currAllocator;
3736             u->currAllocator = this;
3737             u->flags |= tempUMatFlags;
3738         }
3739         if(accessFlags & ACCESS_WRITE)
3740             u->markHostCopyObsolete(true);
3741         return true;
3742     }
3743
3744     /*void sync(UMatData* u) const
3745     {
3746         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3747         UMatDataAutoLock lock(u);
3748
3749         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
3750         {
3751             if( u->tempCopiedUMat() )
3752             {
3753                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3754                                     u->size, u->origdata, 0, 0, 0);
3755             }
3756             else
3757             {
3758                 cl_int retval = 0;
3759                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3760                                                 (CL_MAP_READ | CL_MAP_WRITE),
3761                                                 0, u->size, 0, 0, 0, &retval);
3762                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
3763                 clFinish(q);
3764             }
3765             u->markHostCopyObsolete(false);
3766         }
3767         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
3768         {
3769             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3770                                  u->size, u->data, 0, 0, 0);
3771         }
3772     }*/
3773
3774     void deallocate(UMatData* u) const
3775     {
3776         if(!u)
3777             return;
3778
3779         CV_Assert(u->urefcount >= 0);
3780         CV_Assert(u->refcount >= 0);
3781
3782         // TODO: !!! when we add Shared Virtual Memory Support,
3783         // this function (as well as the others) should be corrected
3784         CV_Assert(u->handle != 0 && u->urefcount == 0);
3785         if(u->tempUMat())
3786         {
3787 //            UMatDataAutoLock lock(u);
3788             if( u->hostCopyObsolete() && u->refcount > 0 )
3789             {
3790                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3791                 if( u->tempCopiedUMat() )
3792                 {
3793                     AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3794                     CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3795                                         u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
3796                 }
3797                 else
3798                 {
3799                     cl_int retval = 0;
3800                     void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3801                                                     (CL_MAP_READ | CL_MAP_WRITE),
3802                                                     0, u->size, 0, 0, 0, &retval);
3803                     CV_OclDbgAssert(retval == CL_SUCCESS);
3804                     CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
3805                     CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3806                 }
3807             }
3808             u->markHostCopyObsolete(false);
3809             clReleaseMemObject((cl_mem)u->handle);
3810             u->handle = 0;
3811             u->currAllocator = u->prevAllocator;
3812             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3813                 fastFree(u->data);
3814             u->data = u->origdata;
3815             if(u->refcount == 0)
3816                 u->currAllocator->deallocate(u);
3817         }
3818         else
3819         {
3820             CV_Assert(u->refcount == 0);
3821             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3822             {
3823                 fastFree(u->data);
3824                 u->data = 0;
3825             }
3826             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
3827             {
3828                 bufferPool.release((cl_mem)u->handle, u->capacity);
3829             }
3830             else
3831             {
3832                 clReleaseMemObject((cl_mem)u->handle);
3833             }
3834             u->handle = 0;
3835             u->capacity = 0;
3836             delete u;
3837         }
3838     }
3839
3840     void map(UMatData* u, int accessFlags) const
3841     {
3842         if(!u)
3843             return;
3844
3845         CV_Assert( u->handle != 0 );
3846
3847         UMatDataAutoLock autolock(u);
3848
3849         if(accessFlags & ACCESS_WRITE)
3850             u->markDeviceCopyObsolete(true);
3851
3852         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3853
3854         // FIXIT Workaround for UMat synchronization issue
3855         // if( u->refcount == 0 )
3856         {
3857             if( !u->copyOnMap() )
3858             {
3859                 if (u->data) // FIXIT Workaround for UMat synchronization issue
3860                 {
3861                     //CV_Assert(u->hostCopyObsolete() == false);
3862                     return;
3863                 }
3864                 // because there can be other map requests for the same UMat with different access flags,
3865                 // we use the universal (read-write) access mode.
3866                 cl_int retval = 0;
3867                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3868                                                      (CL_MAP_READ | CL_MAP_WRITE),
3869                                                      0, u->size, 0, 0, 0, &retval);
3870                 if(u->data && retval == CL_SUCCESS)
3871                 {
3872                     u->markHostCopyObsolete(false);
3873                     return;
3874                 }
3875
3876                 // if map failed, switch to copy-on-map mode for the particular buffer
3877                 u->flags |= UMatData::COPY_ON_MAP;
3878             }
3879
3880             if(!u->data)
3881             {
3882                 u->data = (uchar*)fastMalloc(u->size);
3883                 u->markHostCopyObsolete(true);
3884             }
3885         }
3886
3887         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
3888         {
3889             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3890             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3891                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
3892             u->markHostCopyObsolete(false);
3893         }
3894     }
3895
3896     void unmap(UMatData* u) const
3897     {
3898         if(!u)
3899             return;
3900
3901         CV_Assert(u->handle != 0);
3902
3903         UMatDataAutoLock autolock(u);
3904
3905         // FIXIT Workaround for UMat synchronization issue
3906         if(u->refcount > 0)
3907             return;
3908
3909         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3910         cl_int retval = 0;
3911         if( !u->copyOnMap() && u->data )
3912         {
3913             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
3914                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
3915             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3916             u->data = 0;
3917         }
3918         else if( u->copyOnMap() && u->deviceCopyObsolete() )
3919         {
3920             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3921             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3922                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
3923         }
3924         u->markDeviceCopyObsolete(false);
3925         u->markHostCopyObsolete(false);
3926     }
3927
3928     bool checkContinuous(int dims, const size_t sz[],
3929                          const size_t srcofs[], const size_t srcstep[],
3930                          const size_t dstofs[], const size_t dststep[],
3931                          size_t& total, size_t new_sz[],
3932                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
3933                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
3934     {
3935         bool iscontinuous = true;
3936         srcrawofs = srcofs ? srcofs[dims-1] : 0;
3937         dstrawofs = dstofs ? dstofs[dims-1] : 0;
3938         total = sz[dims-1];
3939         for( int i = dims-2; i >= 0; i-- )
3940         {
3941             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
3942                 iscontinuous = false;
3943             total *= sz[i];
3944             if( srcofs )
3945                 srcrawofs += srcofs[i]*srcstep[i];
3946             if( dstofs )
3947                 dstrawofs += dstofs[i]*dststep[i];
3948         }
3949
3950         if( !iscontinuous )
3951         {
3952             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
3953             if( dims == 2 )
3954             {
3955                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
3956                 // we assume that new_... arrays are initialized by caller
3957                 // with 0's, so there is no else branch
3958                 if( srcofs )
3959                 {
3960                     new_srcofs[0] = srcofs[1];
3961                     new_srcofs[1] = srcofs[0];
3962                     new_srcofs[2] = 0;
3963                 }
3964
3965                 if( dstofs )
3966                 {
3967                     new_dstofs[0] = dstofs[1];
3968                     new_dstofs[1] = dstofs[0];
3969                     new_dstofs[2] = 0;
3970                 }
3971
3972                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
3973                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
3974             }
3975             else
3976             {
3977                 // we could check for dims == 3 here,
3978                 // but from user perspective this one is more informative
3979                 CV_Assert(dims <= 3);
3980                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
3981                 if( srcofs )
3982                 {
3983                     new_srcofs[0] = srcofs[2];
3984                     new_srcofs[1] = srcofs[1];
3985                     new_srcofs[2] = srcofs[0];
3986                 }
3987
3988                 if( dstofs )
3989                 {
3990                     new_dstofs[0] = dstofs[2];
3991                     new_dstofs[1] = dstofs[1];
3992                     new_dstofs[2] = dstofs[0];
3993                 }
3994
3995                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
3996                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
3997             }
3998         }
3999         return iscontinuous;
4000     }
4001
4002     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4003                   const size_t srcofs[], const size_t srcstep[],
4004                   const size_t dststep[]) const
4005     {
4006         if(!u)
4007             return;
4008         UMatDataAutoLock autolock(u);
4009
4010         if( u->data && !u->hostCopyObsolete() )
4011         {
4012             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4013             return;
4014         }
4015         CV_Assert( u->handle != 0 );
4016
4017         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4018
4019         size_t total = 0, new_sz[] = {0, 0, 0};
4020         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4021         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4022
4023         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4024                                             total, new_sz,
4025                                             srcrawofs, new_srcofs, new_srcstep,
4026                                             dstrawofs, new_dstofs, new_dststep);
4027
4028         AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4029         if( iscontinuous )
4030         {
4031             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4032                                            srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4033         }
4034         else
4035         {
4036             CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4037                             new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4038                             new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4039         }
4040     }
4041
4042     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4043                 const size_t dstofs[], const size_t dststep[],
4044                 const size_t srcstep[]) const
4045     {
4046         if(!u)
4047             return;
4048
4049         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4050         CV_Assert(u->refcount == 0 || u->tempUMat());
4051
4052         size_t total = 0, new_sz[] = {0, 0, 0};
4053         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4054         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4055
4056         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4057                                             total, new_sz,
4058                                             srcrawofs, new_srcofs, new_srcstep,
4059                                             dstrawofs, new_dstofs, new_dststep);
4060
4061         UMatDataAutoLock autolock(u);
4062
4063         // if there is cached CPU copy of the GPU matrix,
4064         // we could use it as a destination.
4065         // we can do it in 2 cases:
4066         //    1. we overwrite the whole content
4067         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4068         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4069         {
4070             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4071             u->markHostCopyObsolete(false);
4072             u->markDeviceCopyObsolete(true);
4073             return;
4074         }
4075
4076         CV_Assert( u->handle != 0 );
4077         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4078
4079         AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4080         if( iscontinuous )
4081         {
4082             CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4083                 CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
4084         }
4085         else
4086         {
4087             CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4088                 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4089                 new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
4090         }
4091
4092         u->markHostCopyObsolete(true);
4093         u->markDeviceCopyObsolete(false);
4094     }
4095
4096     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4097               const size_t srcofs[], const size_t srcstep[],
4098               const size_t dstofs[], const size_t dststep[], bool _sync) const
4099     {
4100         if(!src || !dst)
4101             return;
4102
4103         size_t total = 0, new_sz[] = {0, 0, 0};
4104         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4105         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4106
4107         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4108                                             total, new_sz,
4109                                             srcrawofs, new_srcofs, new_srcstep,
4110                                             dstrawofs, new_dstofs, new_dststep);
4111
4112         UMatDataAutoLock src_autolock(src);
4113         UMatDataAutoLock dst_autolock(dst);
4114
4115         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
4116         {
4117             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
4118             return;
4119         }
4120         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
4121         {
4122             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
4123             dst->markHostCopyObsolete(false);
4124             dst->markDeviceCopyObsolete(true);
4125             return;
4126         }
4127
4128         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4129         CV_Assert(dst->refcount == 0);
4130         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4131
4132         if( iscontinuous )
4133         {
4134             CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4135                                            srcrawofs, dstrawofs, total, 0, 0, 0) == CL_SUCCESS );
4136         }
4137         else
4138         {
4139             cl_int retval;
4140             CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4141                                                new_srcofs, new_dstofs, new_sz,
4142                                                new_srcstep[0], new_srcstep[1],
4143                                                new_dststep[0], new_dststep[1],
4144                                                0, 0, 0)) == CL_SUCCESS );
4145         }
4146
4147         dst->markHostCopyObsolete(true);
4148         dst->markDeviceCopyObsolete(false);
4149
4150         if( _sync )
4151         {
4152             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4153         }
4154     }
4155
4156     BufferPoolController* getBufferPoolController() const { return &bufferPool; }
4157
4158     MatAllocator* matStdAllocator;
4159 };
4160
4161 MatAllocator* getOpenCLAllocator()
4162 {
4163     static MatAllocator * allocator = new OpenCLAllocator();
4164     return allocator;
4165 }
4166
4167 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
4168
4169 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
4170 {
4171     cl_uint numDevices = 0;
4172     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4173                                 0, NULL, &numDevices) == CL_SUCCESS);
4174
4175     if (numDevices == 0)
4176     {
4177         devices.clear();
4178         return;
4179     }
4180
4181     devices.resize((size_t)numDevices);
4182     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4183                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
4184 }
4185
4186 struct PlatformInfo::Impl
4187 {
4188     Impl(void* id)
4189     {
4190         refcount = 1;
4191         handle = *(cl_platform_id*)id;
4192         getDevices(devices, handle);
4193     }
4194
4195     String getStrProp(cl_device_info prop) const
4196     {
4197         char buf[1024];
4198         size_t sz=0;
4199         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
4200             sz < sizeof(buf) ? String(buf) : String();
4201     }
4202
4203     IMPLEMENT_REFCOUNTABLE();
4204     std::vector<cl_device_id> devices;
4205     cl_platform_id handle;
4206 };
4207
4208 PlatformInfo::PlatformInfo()
4209 {
4210     p = 0;
4211 }
4212
4213 PlatformInfo::PlatformInfo(void* platform_id)
4214 {
4215     p = new Impl(platform_id);
4216 }
4217
4218 PlatformInfo::~PlatformInfo()
4219 {
4220     if(p)
4221         p->release();
4222 }
4223
4224 PlatformInfo::PlatformInfo(const PlatformInfo& i)
4225 {
4226     if (i.p)
4227         i.p->addref();
4228     p = i.p;
4229 }
4230
4231 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
4232 {
4233     if (i.p != p)
4234     {
4235         if (i.p)
4236             i.p->addref();
4237         if (p)
4238             p->release();
4239         p = i.p;
4240     }
4241     return *this;
4242 }
4243
4244 int PlatformInfo::deviceNumber() const
4245 {
4246     return p ? (int)p->devices.size() : 0;
4247 }
4248
4249 void PlatformInfo::getDevice(Device& device, int d) const
4250 {
4251     CV_Assert(p && d < (int)p->devices.size() );
4252     if(p)
4253         device.set(p->devices[d]);
4254 }
4255
4256 String PlatformInfo::name() const
4257 {
4258     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
4259 }
4260
4261 String PlatformInfo::vendor() const
4262 {
4263     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
4264 }
4265
4266 String PlatformInfo::version() const
4267 {
4268     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
4269 }
4270
4271 static void getPlatforms(std::vector<cl_platform_id>& platforms)
4272 {
4273     cl_uint numPlatforms = 0;
4274     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
4275
4276     if (numPlatforms == 0)
4277     {
4278         platforms.clear();
4279         return;
4280     }
4281
4282     platforms.resize((size_t)numPlatforms);
4283     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
4284 }
4285
4286 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
4287 {
4288     std::vector<cl_platform_id> platforms;
4289     getPlatforms(platforms);
4290
4291     for (size_t i = 0; i < platforms.size(); i++)
4292         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
4293 }
4294
4295 const char* typeToStr(int type)
4296 {
4297     static const char* tab[]=
4298     {
4299         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4300         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4301         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4302         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4303         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4304         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
4305         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
4306         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4307     };
4308     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4309     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4310 }
4311
4312 const char* memopTypeToStr(int type)
4313 {
4314     static const char* tab[] =
4315     {
4316         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4317         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4318         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4319         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4320         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4321         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4322         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
4323         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4324     };
4325     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4326     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4327 }
4328
4329 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
4330 {
4331     if( sdepth == ddepth )
4332         return "noconvert";
4333     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
4334     if( ddepth >= CV_32F ||
4335         (ddepth == CV_32S && sdepth < CV_32S) ||
4336         (ddepth == CV_16S && sdepth <= CV_8S) ||
4337         (ddepth == CV_16U && sdepth == CV_8U))
4338     {
4339         sprintf(buf, "convert_%s", typestr);
4340     }
4341     else if( sdepth >= CV_32F )
4342         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
4343     else
4344         sprintf(buf, "convert_%s_sat", typestr);
4345
4346     return buf;
4347 }
4348
4349 template <typename T>
4350 static std::string kerToStr(const Mat & k)
4351 {
4352     int width = k.cols - 1, depth = k.depth();
4353     const T * const data = reinterpret_cast<const T *>(k.data);
4354
4355     std::ostringstream stream;
4356     stream.precision(10);
4357
4358     if (depth <= CV_8S)
4359     {
4360         for (int i = 0; i < width; ++i)
4361             stream << "DIG(" << (int)data[i] << ")";
4362         stream << "DIG(" << (int)data[width] << ")";
4363     }
4364     else if (depth == CV_32F)
4365     {
4366         stream.setf(std::ios_base::showpoint);
4367         for (int i = 0; i < width; ++i)
4368             stream << "DIG(" << data[i] << "f)";
4369         stream << "DIG(" << data[width] << "f)";
4370     }
4371     else
4372     {
4373         for (int i = 0; i < width; ++i)
4374             stream << "DIG(" << data[i] << ")";
4375         stream << "DIG(" << data[width] << ")";
4376     }
4377
4378     return stream.str();
4379 }
4380
4381 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
4382 {
4383     Mat kernel = _kernel.getMat().reshape(1, 1);
4384
4385     int depth = kernel.depth();
4386     if (ddepth < 0)
4387         ddepth = depth;
4388
4389     if (ddepth != depth)
4390         kernel.convertTo(kernel, ddepth);
4391
4392     typedef std::string (* func_t)(const Mat &);
4393     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
4394                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
4395     const func_t func = funcs[ddepth];
4396     CV_Assert(func != 0);
4397
4398     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
4399 }
4400
4401 #define PROCESS_SRC(src) \
4402     do \
4403     { \
4404         if (!src.empty()) \
4405         { \
4406             CV_Assert(src.isMat() || src.isUMat()); \
4407             int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
4408             Size csize = src.size(); \
4409             cols.push_back(ccn * csize.width); \
4410             if (ctype != type) \
4411                 return 1; \
4412             offsets.push_back(src.offset()); \
4413             steps.push_back(src.step()); \
4414         } \
4415     } \
4416     while ((void)0, 0)
4417
4418 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
4419                               InputArray src4, InputArray src5, InputArray src6,
4420                               InputArray src7, InputArray src8, InputArray src9)
4421 {
4422     int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
4423     Size ssize = src1.size();
4424     const ocl::Device & d = ocl::Device::getDefault();
4425
4426     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
4427         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
4428         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
4429         d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
4430     if (d.isIntel())
4431     {
4432         // it's heuristic
4433         int vectorWidthsIntel[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
4434         kercn = vectorWidthsIntel[depth];
4435     }
4436
4437     if (ssize.width * cn < kercn || kercn <= 0)
4438         return 1;
4439
4440     std::vector<size_t> offsets, steps, cols;
4441     PROCESS_SRC(src1);
4442     PROCESS_SRC(src2);
4443     PROCESS_SRC(src3);
4444     PROCESS_SRC(src4);
4445     PROCESS_SRC(src5);
4446     PROCESS_SRC(src6);
4447     PROCESS_SRC(src7);
4448     PROCESS_SRC(src8);
4449     PROCESS_SRC(src9);
4450
4451     size_t size = offsets.size();
4452     int wsz = kercn * esz1;
4453     std::vector<int> dividers(size, wsz);
4454
4455     for (size_t i = 0; i < size; ++i)
4456         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
4457             dividers[i] >>= 1;
4458
4459     // default strategy
4460     for (size_t i = 0; i < size; ++i)
4461         if (dividers[i] != wsz)
4462         {
4463             kercn = 1;
4464             break;
4465         }
4466
4467     // another strategy
4468 //    width = *std::min_element(dividers.begin(), dividers.end());
4469
4470     return kercn;
4471 }
4472
4473 #undef PROCESS_SRC
4474
4475
4476 // TODO Make this as a method of OpenCL "BuildOptions" class
4477 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
4478 {
4479     if (!buildOptions.empty())
4480         buildOptions += " ";
4481     int type = _m.type(), depth = CV_MAT_DEPTH(type);
4482     buildOptions += format(
4483             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
4484             name.c_str(), ocl::typeToStr(type),
4485             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
4486             name.c_str(), (int)CV_MAT_CN(type),
4487             name.c_str(), (int)CV_ELEM_SIZE(type),
4488             name.c_str(), (int)CV_ELEM_SIZE1(type),
4489             name.c_str(), (int)depth
4490             );
4491 }
4492
4493
4494 struct Image2D::Impl
4495 {
4496     Impl(const UMat &src, bool norm, bool alias)
4497     {
4498         handle = 0;
4499         refcount = 1;
4500         init(src, norm, alias);
4501     }
4502
4503     ~Impl()
4504     {
4505         if (handle)
4506             clReleaseMemObject(handle);
4507     }
4508
4509     static cl_image_format getImageFormat(int depth, int cn, bool norm)
4510     {
4511         cl_image_format format;
4512         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
4513                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
4514         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
4515                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
4516         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
4517
4518         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
4519         int channelOrder = channelOrders[cn];
4520         format.image_channel_data_type = (cl_channel_type)channelType;
4521         format.image_channel_order = (cl_channel_order)channelOrder;
4522         return format;
4523     }
4524
4525     static bool isFormatSupported(cl_image_format format)
4526     {
4527         cl_context context = (cl_context)Context::getDefault().ptr();
4528         // Figure out how many formats are supported by this context.
4529         cl_uint numFormats = 0;
4530         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4531                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
4532                                                 NULL, &numFormats);
4533         AutoBuffer<cl_image_format> formats(numFormats);
4534         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4535                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
4536                                          formats, NULL);
4537         CV_OclDbgAssert(err == CL_SUCCESS);
4538         for (cl_uint i = 0; i < numFormats; ++i)
4539         {
4540             if (!memcmp(&formats[i], &format, sizeof(format)))
4541             {
4542                 return true;
4543             }
4544         }
4545         return false;
4546     }
4547
4548     void init(const UMat &src, bool norm, bool alias)
4549     {
4550         CV_Assert(ocl::Device::getDefault().imageSupport());
4551
4552         int err, depth = src.depth(), cn = src.channels();
4553         CV_Assert(cn <= 4);
4554         cl_image_format format = getImageFormat(depth, cn, norm);
4555
4556         if (!isFormatSupported(format))
4557             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
4558
4559         cl_context context = (cl_context)Context::getDefault().ptr();
4560         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
4561
4562 #ifdef CL_VERSION_1_2
4563         // this enables backwards portability to
4564         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
4565         const Device & d = ocl::Device::getDefault();
4566         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
4567         CV_Assert(!alias || canCreateAlias(src));
4568         if (1 < major || (1 == major && 2 <= minor))
4569         {
4570             cl_image_desc desc;
4571             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
4572             desc.image_width      = src.cols;
4573             desc.image_height     = src.rows;
4574             desc.image_depth      = 0;
4575             desc.image_array_size = 1;
4576             desc.image_row_pitch  = alias ? src.step[0] : 0;
4577             desc.image_slice_pitch = 0;
4578             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
4579             desc.num_mip_levels   = 0;
4580             desc.num_samples      = 0;
4581             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
4582         }
4583         else
4584 #endif
4585         {
4586             CV_SUPPRESS_DEPRECATED_START
4587             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
4588             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
4589             CV_SUPPRESS_DEPRECATED_END
4590         }
4591         CV_OclDbgAssert(err == CL_SUCCESS);
4592
4593         size_t origin[] = { 0, 0, 0 };
4594         size_t region[] = { src.cols, src.rows, 1 };
4595
4596         cl_mem devData;
4597         if (!alias && !src.isContinuous())
4598         {
4599             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
4600             CV_OclDbgAssert(err == CL_SUCCESS);
4601
4602             const size_t roi[3] = {src.cols * src.elemSize(), src.rows, 1};
4603             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
4604                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
4605             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4606         }
4607         else
4608         {
4609             devData = (cl_mem)src.handle(ACCESS_READ);
4610         }
4611         CV_Assert(devData != NULL);
4612
4613         if (!alias)
4614         {
4615             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
4616             if (!src.isContinuous())
4617             {
4618                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4619                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
4620             }
4621         }
4622     }
4623
4624     IMPLEMENT_REFCOUNTABLE();
4625
4626     cl_mem handle;
4627 };
4628
4629 Image2D::Image2D()
4630 {
4631     p = NULL;
4632 }
4633
4634 Image2D::Image2D(const UMat &src, bool norm, bool alias)
4635 {
4636     p = new Impl(src, norm, alias);
4637 }
4638
4639 bool Image2D::canCreateAlias(const UMat &m)
4640 {
4641     bool ret = false;
4642     const Device & d = ocl::Device::getDefault();
4643     if (d.imageFromBufferSupport())
4644     {
4645         // This is the required pitch alignment in pixels
4646         uint pitchAlign = d.imagePitchAlignment();
4647         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
4648         {
4649             // We don't currently handle the case where the buffer was created
4650             // with CL_MEM_USE_HOST_PTR
4651             if (!m.u->tempUMat())
4652             {
4653                 ret = true;
4654             }
4655         }
4656     }
4657     return ret;
4658 }
4659
4660 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
4661 {
4662     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
4663
4664     return Impl::isFormatSupported(format);
4665 }
4666
4667 Image2D::Image2D(const Image2D & i)
4668 {
4669     p = i.p;
4670     if (p)
4671         p->addref();
4672 }
4673
4674 Image2D & Image2D::operator = (const Image2D & i)
4675 {
4676     if (i.p != p)
4677     {
4678         if (i.p)
4679             i.p->addref();
4680         if (p)
4681             p->release();
4682         p = i.p;
4683     }
4684     return *this;
4685 }
4686
4687 Image2D::~Image2D()
4688 {
4689     if (p)
4690         p->release();
4691 }
4692
4693 void* Image2D::ptr() const
4694 {
4695     return p ? p->handle : 0;
4696 }
4697
4698 }}