Merge pull request #2947 from vbystricky:capDShow
[profile/ivi/opencv.git] / modules / core / src / ocl.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include <list>
44 #include <map>
45 #include <string>
46 #include <sstream>
47 #include <iostream> // std::cerr
48
49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
50
51 #include "opencv2/core/bufferpool.hpp"
52 #ifndef LOG_BUFFER_POOL
53 # if 0
54 #   define LOG_BUFFER_POOL printf
55 # else
56 #   define LOG_BUFFER_POOL(...)
57 # endif
58 #endif
59
60 // TODO Move to some common place
61 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
62 {
63 #ifdef HAVE_WINRT
64     const char* envValue = NULL;
65 #else
66     const char* envValue = getenv(name);
67 #endif
68     if (envValue == NULL)
69     {
70         return defaultValue;
71     }
72     cv::String value = envValue;
73     size_t pos = 0;
74     for (; pos < value.size(); pos++)
75     {
76         if (!isdigit(value[pos]))
77             break;
78     }
79     cv::String valueStr = value.substr(0, pos);
80     cv::String suffixStr = value.substr(pos, value.length() - pos);
81     int v = atoi(valueStr.c_str());
82     if (suffixStr.length() == 0)
83         return v;
84     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
85         return v * 1024 * 1024;
86     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
87         return v * 1024;
88     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
89 }
90
91 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
92 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
93
94 #ifdef HAVE_OPENCL
95 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
96 #else
97 // TODO FIXIT: This file can't be build without OPENCL
98
99 /*
100   Part of the file is an extract from the standard OpenCL headers from Khronos site.
101   Below is the original copyright.
102 */
103
104 /*******************************************************************************
105  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
106  *
107  * Permission is hereby granted, free of charge, to any person obtaining a
108  * copy of this software and/or associated documentation files (the
109  * "Materials"), to deal in the Materials without restriction, including
110  * without limitation the rights to use, copy, modify, merge, publish,
111  * distribute, sublicense, and/or sell copies of the Materials, and to
112  * permit persons to whom the Materials are furnished to do so, subject to
113  * the following conditions:
114  *
115  * The above copyright notice and this permission notice shall be included
116  * in all copies or substantial portions of the Materials.
117  *
118  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
119  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
120  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
121  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
122  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
123  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
124  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
125  ******************************************************************************/
126
127 #if 0 //defined __APPLE__
128 #define HAVE_OPENCL 1
129 #else
130 #undef HAVE_OPENCL
131 #endif
132
133 #define OPENCV_CL_NOT_IMPLEMENTED -1000
134
135 #ifdef HAVE_OPENCL
136
137 #if defined __APPLE__
138 #include <OpenCL/opencl.h>
139 #else
140 #include <CL/opencl.h>
141 #endif
142
143 static const bool g_haveOpenCL = true;
144
145 #else
146
147 extern "C" {
148
149 struct _cl_platform_id { int dummy; };
150 struct _cl_device_id { int dummy; };
151 struct _cl_context { int dummy; };
152 struct _cl_command_queue { int dummy; };
153 struct _cl_mem { int dummy; };
154 struct _cl_program { int dummy; };
155 struct _cl_kernel { int dummy; };
156 struct _cl_event { int dummy; };
157 struct _cl_sampler { int dummy; };
158
159 typedef struct _cl_platform_id *    cl_platform_id;
160 typedef struct _cl_device_id *      cl_device_id;
161 typedef struct _cl_context *        cl_context;
162 typedef struct _cl_command_queue *  cl_command_queue;
163 typedef struct _cl_mem *            cl_mem;
164 typedef struct _cl_program *        cl_program;
165 typedef struct _cl_kernel *         cl_kernel;
166 typedef struct _cl_event *          cl_event;
167 typedef struct _cl_sampler *        cl_sampler;
168
169 typedef int cl_int;
170 typedef unsigned cl_uint;
171 #if defined (_WIN32) && defined(_MSC_VER)
172     typedef __int64 cl_long;
173     typedef unsigned __int64 cl_ulong;
174 #else
175     typedef long cl_long;
176     typedef unsigned long cl_ulong;
177 #endif
178
179 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
180 typedef cl_ulong            cl_bitfield;
181 typedef cl_bitfield         cl_device_type;
182 typedef cl_uint             cl_platform_info;
183 typedef cl_uint             cl_device_info;
184 typedef cl_bitfield         cl_device_fp_config;
185 typedef cl_uint             cl_device_mem_cache_type;
186 typedef cl_uint             cl_device_local_mem_type;
187 typedef cl_bitfield         cl_device_exec_capabilities;
188 typedef cl_bitfield         cl_command_queue_properties;
189 typedef intptr_t            cl_device_partition_property;
190 typedef cl_bitfield         cl_device_affinity_domain;
191
192 typedef intptr_t            cl_context_properties;
193 typedef cl_uint             cl_context_info;
194 typedef cl_uint             cl_command_queue_info;
195 typedef cl_uint             cl_channel_order;
196 typedef cl_uint             cl_channel_type;
197 typedef cl_bitfield         cl_mem_flags;
198 typedef cl_uint             cl_mem_object_type;
199 typedef cl_uint             cl_mem_info;
200 typedef cl_bitfield         cl_mem_migration_flags;
201 typedef cl_uint             cl_image_info;
202 typedef cl_uint             cl_buffer_create_type;
203 typedef cl_uint             cl_addressing_mode;
204 typedef cl_uint             cl_filter_mode;
205 typedef cl_uint             cl_sampler_info;
206 typedef cl_bitfield         cl_map_flags;
207 typedef cl_uint             cl_program_info;
208 typedef cl_uint             cl_program_build_info;
209 typedef cl_uint             cl_program_binary_type;
210 typedef cl_int              cl_build_status;
211 typedef cl_uint             cl_kernel_info;
212 typedef cl_uint             cl_kernel_arg_info;
213 typedef cl_uint             cl_kernel_arg_address_qualifier;
214 typedef cl_uint             cl_kernel_arg_access_qualifier;
215 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
216 typedef cl_uint             cl_kernel_work_group_info;
217 typedef cl_uint             cl_event_info;
218 typedef cl_uint             cl_command_type;
219 typedef cl_uint             cl_profiling_info;
220
221
222 typedef struct _cl_image_format {
223     cl_channel_order        image_channel_order;
224     cl_channel_type         image_channel_data_type;
225 } cl_image_format;
226
227 typedef struct _cl_image_desc {
228     cl_mem_object_type      image_type;
229     size_t                  image_width;
230     size_t                  image_height;
231     size_t                  image_depth;
232     size_t                  image_array_size;
233     size_t                  image_row_pitch;
234     size_t                  image_slice_pitch;
235     cl_uint                 num_mip_levels;
236     cl_uint                 num_samples;
237     cl_mem                  buffer;
238 } cl_image_desc;
239
240 typedef struct _cl_buffer_region {
241     size_t                  origin;
242     size_t                  size;
243 } cl_buffer_region;
244
245
246 //////////////////////////////////////////////////////////
247
248 #define CL_SUCCESS                                  0
249 #define CL_DEVICE_NOT_FOUND                         -1
250 #define CL_DEVICE_NOT_AVAILABLE                     -2
251 #define CL_COMPILER_NOT_AVAILABLE                   -3
252 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
253 #define CL_OUT_OF_RESOURCES                         -5
254 #define CL_OUT_OF_HOST_MEMORY                       -6
255 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
256 #define CL_MEM_COPY_OVERLAP                         -8
257 #define CL_IMAGE_FORMAT_MISMATCH                    -9
258 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
259 #define CL_BUILD_PROGRAM_FAILURE                    -11
260 #define CL_MAP_FAILURE                              -12
261 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
262 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
263 #define CL_COMPILE_PROGRAM_FAILURE                  -15
264 #define CL_LINKER_NOT_AVAILABLE                     -16
265 #define CL_LINK_PROGRAM_FAILURE                     -17
266 #define CL_DEVICE_PARTITION_FAILED                  -18
267 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
268
269 #define CL_INVALID_VALUE                            -30
270 #define CL_INVALID_DEVICE_TYPE                      -31
271 #define CL_INVALID_PLATFORM                         -32
272 #define CL_INVALID_DEVICE                           -33
273 #define CL_INVALID_CONTEXT                          -34
274 #define CL_INVALID_QUEUE_PROPERTIES                 -35
275 #define CL_INVALID_COMMAND_QUEUE                    -36
276 #define CL_INVALID_HOST_PTR                         -37
277 #define CL_INVALID_MEM_OBJECT                       -38
278 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
279 #define CL_INVALID_IMAGE_SIZE                       -40
280 #define CL_INVALID_SAMPLER                          -41
281 #define CL_INVALID_BINARY                           -42
282 #define CL_INVALID_BUILD_OPTIONS                    -43
283 #define CL_INVALID_PROGRAM                          -44
284 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
285 #define CL_INVALID_KERNEL_NAME                      -46
286 #define CL_INVALID_KERNEL_DEFINITION                -47
287 #define CL_INVALID_KERNEL                           -48
288 #define CL_INVALID_ARG_INDEX                        -49
289 #define CL_INVALID_ARG_VALUE                        -50
290 #define CL_INVALID_ARG_SIZE                         -51
291 #define CL_INVALID_KERNEL_ARGS                      -52
292 #define CL_INVALID_WORK_DIMENSION                   -53
293 #define CL_INVALID_WORK_GROUP_SIZE                  -54
294 #define CL_INVALID_WORK_ITEM_SIZE                   -55
295 #define CL_INVALID_GLOBAL_OFFSET                    -56
296 #define CL_INVALID_EVENT_WAIT_LIST                  -57
297 #define CL_INVALID_EVENT                            -58
298 #define CL_INVALID_OPERATION                        -59
299 #define CL_INVALID_GL_OBJECT                        -60
300 #define CL_INVALID_BUFFER_SIZE                      -61
301 #define CL_INVALID_MIP_LEVEL                        -62
302 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
303 #define CL_INVALID_PROPERTY                         -64
304 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
305 #define CL_INVALID_COMPILER_OPTIONS                 -66
306 #define CL_INVALID_LINKER_OPTIONS                   -67
307 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
308
309 /*#define CL_VERSION_1_0                              1
310 #define CL_VERSION_1_1                              1
311 #define CL_VERSION_1_2                              1*/
312
313 #define CL_FALSE                                    0
314 #define CL_TRUE                                     1
315 #define CL_BLOCKING                                 CL_TRUE
316 #define CL_NON_BLOCKING                             CL_FALSE
317
318 #define CL_PLATFORM_PROFILE                         0x0900
319 #define CL_PLATFORM_VERSION                         0x0901
320 #define CL_PLATFORM_NAME                            0x0902
321 #define CL_PLATFORM_VENDOR                          0x0903
322 #define CL_PLATFORM_EXTENSIONS                      0x0904
323
324 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
325 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
326 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
327 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
328 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
329 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
330 #define CL_DEVICE_TYPE                              0x1000
331 #define CL_DEVICE_VENDOR_ID                         0x1001
332 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
333 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
334 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
335 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
336 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
337 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
338 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
339 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
340 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
341 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
342 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
343 #define CL_DEVICE_ADDRESS_BITS                      0x100D
344 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
345 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
346 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
347 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
348 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
349 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
350 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
351 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
352 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
353 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
354 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
355 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
356 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
357 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
358 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
359 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
360 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
361 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
362 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
363 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
364 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
365 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
366 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
367 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
368 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
369 #define CL_DEVICE_AVAILABLE                         0x1027
370 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
371 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
372 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
373 #define CL_DEVICE_NAME                              0x102B
374 #define CL_DEVICE_VENDOR                            0x102C
375 #define CL_DRIVER_VERSION                           0x102D
376 #define CL_DEVICE_PROFILE                           0x102E
377 #define CL_DEVICE_VERSION                           0x102F
378 #define CL_DEVICE_EXTENSIONS                        0x1030
379 #define CL_DEVICE_PLATFORM                          0x1031
380 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
381 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
382 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
383 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
384 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
385 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
386 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
387 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
388 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
389 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
390 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
391 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
392 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
393 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
394 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
395 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
396 #define CL_DEVICE_PARENT_DEVICE                     0x1042
397 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
398 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
399 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
400 #define CL_DEVICE_PARTITION_TYPE                    0x1046
401 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
402 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
403 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
404 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
405 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
406
407 #define CL_FP_DENORM                                (1 << 0)
408 #define CL_FP_INF_NAN                               (1 << 1)
409 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
410 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
411 #define CL_FP_ROUND_TO_INF                          (1 << 4)
412 #define CL_FP_FMA                                   (1 << 5)
413 #define CL_FP_SOFT_FLOAT                            (1 << 6)
414 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
415
416 #define CL_NONE                                     0x0
417 #define CL_READ_ONLY_CACHE                          0x1
418 #define CL_READ_WRITE_CACHE                         0x2
419 #define CL_LOCAL                                    0x1
420 #define CL_GLOBAL                                   0x2
421 #define CL_EXEC_KERNEL                              (1 << 0)
422 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
423 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
424 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
425
426 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
427 #define CL_CONTEXT_DEVICES                          0x1081
428 #define CL_CONTEXT_PROPERTIES                       0x1082
429 #define CL_CONTEXT_NUM_DEVICES                      0x1083
430 #define CL_CONTEXT_PLATFORM                         0x1084
431 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
432
433 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
434 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
435 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
436 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
437 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
438 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
439 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
440 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
441 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
442 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
443 #define CL_QUEUE_CONTEXT                            0x1090
444 #define CL_QUEUE_DEVICE                             0x1091
445 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
446 #define CL_QUEUE_PROPERTIES                         0x1093
447 #define CL_MEM_READ_WRITE                           (1 << 0)
448 #define CL_MEM_WRITE_ONLY                           (1 << 1)
449 #define CL_MEM_READ_ONLY                            (1 << 2)
450 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
451 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
452 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
453 // reserved                                         (1 << 6)
454 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
455 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
456 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
457 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
458 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
459
460 #define CL_R                                        0x10B0
461 #define CL_A                                        0x10B1
462 #define CL_RG                                       0x10B2
463 #define CL_RA                                       0x10B3
464 #define CL_RGB                                      0x10B4
465 #define CL_RGBA                                     0x10B5
466 #define CL_BGRA                                     0x10B6
467 #define CL_ARGB                                     0x10B7
468 #define CL_INTENSITY                                0x10B8
469 #define CL_LUMINANCE                                0x10B9
470 #define CL_Rx                                       0x10BA
471 #define CL_RGx                                      0x10BB
472 #define CL_RGBx                                     0x10BC
473 #define CL_DEPTH                                    0x10BD
474 #define CL_DEPTH_STENCIL                            0x10BE
475
476 #define CL_SNORM_INT8                               0x10D0
477 #define CL_SNORM_INT16                              0x10D1
478 #define CL_UNORM_INT8                               0x10D2
479 #define CL_UNORM_INT16                              0x10D3
480 #define CL_UNORM_SHORT_565                          0x10D4
481 #define CL_UNORM_SHORT_555                          0x10D5
482 #define CL_UNORM_INT_101010                         0x10D6
483 #define CL_SIGNED_INT8                              0x10D7
484 #define CL_SIGNED_INT16                             0x10D8
485 #define CL_SIGNED_INT32                             0x10D9
486 #define CL_UNSIGNED_INT8                            0x10DA
487 #define CL_UNSIGNED_INT16                           0x10DB
488 #define CL_UNSIGNED_INT32                           0x10DC
489 #define CL_HALF_FLOAT                               0x10DD
490 #define CL_FLOAT                                    0x10DE
491 #define CL_UNORM_INT24                              0x10DF
492
493 #define CL_MEM_OBJECT_BUFFER                        0x10F0
494 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
495 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
496 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
497 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
498 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
499 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
500
501 #define CL_MEM_TYPE                                 0x1100
502 #define CL_MEM_FLAGS                                0x1101
503 #define CL_MEM_SIZE                                 0x1102
504 #define CL_MEM_HOST_PTR                             0x1103
505 #define CL_MEM_MAP_COUNT                            0x1104
506 #define CL_MEM_REFERENCE_COUNT                      0x1105
507 #define CL_MEM_CONTEXT                              0x1106
508 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
509 #define CL_MEM_OFFSET                               0x1108
510
511 #define CL_IMAGE_FORMAT                             0x1110
512 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
513 #define CL_IMAGE_ROW_PITCH                          0x1112
514 #define CL_IMAGE_SLICE_PITCH                        0x1113
515 #define CL_IMAGE_WIDTH                              0x1114
516 #define CL_IMAGE_HEIGHT                             0x1115
517 #define CL_IMAGE_DEPTH                              0x1116
518 #define CL_IMAGE_ARRAY_SIZE                         0x1117
519 #define CL_IMAGE_BUFFER                             0x1118
520 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
521 #define CL_IMAGE_NUM_SAMPLES                        0x111A
522
523 #define CL_ADDRESS_NONE                             0x1130
524 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
525 #define CL_ADDRESS_CLAMP                            0x1132
526 #define CL_ADDRESS_REPEAT                           0x1133
527 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
528
529 #define CL_FILTER_NEAREST                           0x1140
530 #define CL_FILTER_LINEAR                            0x1141
531
532 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
533 #define CL_SAMPLER_CONTEXT                          0x1151
534 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
535 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
536 #define CL_SAMPLER_FILTER_MODE                      0x1154
537
538 #define CL_MAP_READ                                 (1 << 0)
539 #define CL_MAP_WRITE                                (1 << 1)
540 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
541
542 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
543 #define CL_PROGRAM_CONTEXT                          0x1161
544 #define CL_PROGRAM_NUM_DEVICES                      0x1162
545 #define CL_PROGRAM_DEVICES                          0x1163
546 #define CL_PROGRAM_SOURCE                           0x1164
547 #define CL_PROGRAM_BINARY_SIZES                     0x1165
548 #define CL_PROGRAM_BINARIES                         0x1166
549 #define CL_PROGRAM_NUM_KERNELS                      0x1167
550 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
551 #define CL_PROGRAM_BUILD_STATUS                     0x1181
552 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
553 #define CL_PROGRAM_BUILD_LOG                        0x1183
554 #define CL_PROGRAM_BINARY_TYPE                      0x1184
555 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
556 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
557 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
558 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
559
560 #define CL_BUILD_SUCCESS                            0
561 #define CL_BUILD_NONE                               -1
562 #define CL_BUILD_ERROR                              -2
563 #define CL_BUILD_IN_PROGRESS                        -3
564
565 #define CL_KERNEL_FUNCTION_NAME                     0x1190
566 #define CL_KERNEL_NUM_ARGS                          0x1191
567 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
568 #define CL_KERNEL_CONTEXT                           0x1193
569 #define CL_KERNEL_PROGRAM                           0x1194
570 #define CL_KERNEL_ATTRIBUTES                        0x1195
571 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
572 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
573 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
574 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
575 #define CL_KERNEL_ARG_NAME                          0x119A
576 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
577 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
578 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
579 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
580 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
581 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
582 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
583 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
584 #define CL_KERNEL_ARG_TYPE_NONE                     0
585 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
586 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
587 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
588 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
589 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
590 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
591 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
592 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
593 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
594
595 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
596 #define CL_EVENT_COMMAND_TYPE                       0x11D1
597 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
598 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
599 #define CL_EVENT_CONTEXT                            0x11D4
600
601 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
602 #define CL_COMMAND_TASK                             0x11F1
603 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
604 #define CL_COMMAND_READ_BUFFER                      0x11F3
605 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
606 #define CL_COMMAND_COPY_BUFFER                      0x11F5
607 #define CL_COMMAND_READ_IMAGE                       0x11F6
608 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
609 #define CL_COMMAND_COPY_IMAGE                       0x11F8
610 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
611 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
612 #define CL_COMMAND_MAP_BUFFER                       0x11FB
613 #define CL_COMMAND_MAP_IMAGE                        0x11FC
614 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
615 #define CL_COMMAND_MARKER                           0x11FE
616 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
617 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
618 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
619 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
620 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
621 #define CL_COMMAND_USER                             0x1204
622 #define CL_COMMAND_BARRIER                          0x1205
623 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
624 #define CL_COMMAND_FILL_BUFFER                      0x1207
625 #define CL_COMMAND_FILL_IMAGE                       0x1208
626
627 #define CL_COMPLETE                                 0x0
628 #define CL_RUNNING                                  0x1
629 #define CL_SUBMITTED                                0x2
630 #define CL_QUEUED                                   0x3
631 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
632
633 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
634 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
635 #define CL_PROFILING_COMMAND_START                  0x1282
636 #define CL_PROFILING_COMMAND_END                    0x1283
637
638 #define CL_CALLBACK CV_STDCALL
639
640 static volatile bool g_haveOpenCL = false;
641 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
642
643 #if defined(__APPLE__)
644 #include <dlfcn.h>
645
646 static void* initOpenCLAndLoad(const char* funcname)
647 {
648     static bool initialized = false;
649     static void* handle = 0;
650     if (!handle)
651     {
652         if(!initialized)
653         {
654             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
655             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
656                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
657             handle = dlopen(oclpath, RTLD_LAZY);
658             initialized = true;
659             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
660             if( g_haveOpenCL )
661                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
662             else
663                 fprintf(stderr, "Failed to load OpenCL runtime\n");
664         }
665         if(!handle)
666             return 0;
667     }
668
669     return funcname && handle ? dlsym(handle, funcname) : 0;
670 }
671
672 #elif defined WIN32 || defined _WIN32
673
674 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
675   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
676 #endif
677 #include <windows.h>
678 #if (_WIN32_WINNT >= 0x0602)
679   #include <synchapi.h>
680 #endif
681 #undef small
682 #undef min
683 #undef max
684 #undef abs
685
686 static void* initOpenCLAndLoad(const char* funcname)
687 {
688     static bool initialized = false;
689     static HMODULE handle = 0;
690     if (!handle)
691     {
692 #ifndef HAVE_WINRT
693         if(!initialized)
694         {
695             handle = LoadLibraryA("OpenCL.dll");
696             initialized = true;
697             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
698         }
699 #endif
700         if(!handle)
701             return 0;
702     }
703
704     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
705 }
706
707 #elif defined(__linux)
708
709 #include <dlfcn.h>
710 #include <stdio.h>
711
712 static void* initOpenCLAndLoad(const char* funcname)
713 {
714     static bool initialized = false;
715     static void* handle = 0;
716     if (!handle)
717     {
718         if(!initialized)
719         {
720             handle = dlopen("libOpenCL.so", RTLD_LAZY);
721             if(!handle)
722                 handle = dlopen("libCL.so", RTLD_LAZY);
723             initialized = true;
724             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
725         }
726         if(!handle)
727             return 0;
728     }
729
730     return funcname ? (void*)dlsym(handle, funcname) : 0;
731 }
732
733 #else
734
735 static void* initOpenCLAndLoad(const char*)
736 {
737     return 0;
738 }
739
740 #endif
741
742
743 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
744     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
745     static rettype funcname argsdecl \
746     { \
747         static funcname##_t funcname##_p = 0; \
748         if( !funcname##_p ) \
749         { \
750             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
751             if( !funcname##_p ) \
752                 return OPENCV_CL_NOT_IMPLEMENTED; \
753         } \
754         return funcname##_p args; \
755     }
756
757
758 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
759     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
760     static rettype funcname argsdecl \
761     { \
762         static funcname##_t funcname##_p = 0; \
763         if( !funcname##_p ) \
764         { \
765             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
766             if( !funcname##_p ) \
767             { \
768                 if( errcode_ret ) \
769                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
770                 return 0; \
771             } \
772         } \
773         return funcname##_p args; \
774     }
775
776 OCL_FUNC(cl_int, clGetPlatformIDs,
777     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
778     (num_entries, platforms, num_platforms))
779
780 OCL_FUNC(cl_int, clGetPlatformInfo,
781     (cl_platform_id platform, cl_platform_info param_name,
782     size_t param_value_size, void * param_value,
783     size_t * param_value_size_ret),
784     (platform, param_name, param_value_size, param_value, param_value_size_ret))
785
786 OCL_FUNC(cl_int, clGetDeviceInfo,
787          (cl_device_id device,
788           cl_device_info param_name,
789           size_t param_value_size,
790           void * param_value,
791           size_t * param_value_size_ret),
792          (device, param_name, param_value_size, param_value, param_value_size_ret))
793
794
795 OCL_FUNC(cl_int, clGetDeviceIDs,
796     (cl_platform_id platform,
797     cl_device_type device_type,
798     cl_uint num_entries,
799     cl_device_id * devices,
800     cl_uint * num_devices),
801     (platform, device_type, num_entries, devices, num_devices))
802
803 OCL_FUNC_P(cl_context, clCreateContext,
804     (const cl_context_properties * properties,
805     cl_uint num_devices,
806     const cl_device_id * devices,
807     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
808     void * user_data,
809     cl_int * errcode_ret),
810     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
811
812 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
813
814 /*
815 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
816
817 OCL_FUNC_P(cl_context, clCreateContextFromType,
818     (const cl_context_properties * properties,
819     cl_device_type device_type,
820     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
821     void * user_data,
822     cl_int * errcode_ret),
823     (properties, device_type, pfn_notify, user_data, errcode_ret))
824
825 OCL_FUNC(cl_int, clGetContextInfo,
826     (cl_context context,
827     cl_context_info param_name,
828     size_t param_value_size,
829     void * param_value,
830     size_t * param_value_size_ret),
831     (context, param_name, param_value_size,
832     param_value, param_value_size_ret))
833 */
834 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
835     (cl_context context,
836     cl_device_id device,
837     cl_command_queue_properties properties,
838     cl_int * errcode_ret),
839     (context, device, properties, errcode_ret))
840
841 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
842
843 OCL_FUNC_P(cl_mem, clCreateBuffer,
844     (cl_context context,
845     cl_mem_flags flags,
846     size_t size,
847     void * host_ptr,
848     cl_int * errcode_ret),
849     (context, flags, size, host_ptr, errcode_ret))
850
851 /*
852 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
853
854 OCL_FUNC(cl_int, clGetCommandQueueInfo,
855  (cl_command_queue command_queue,
856  cl_command_queue_info param_name,
857  size_t param_value_size,
858  void * param_value,
859  size_t * param_value_size_ret),
860  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
861
862 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
863     (cl_mem buffer,
864     cl_mem_flags flags,
865     cl_buffer_create_type buffer_create_type,
866     const void * buffer_create_info,
867     cl_int * errcode_ret),
868     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
869 */
870
871 OCL_FUNC_P(cl_mem, clCreateImage,
872     (cl_context context,
873     cl_mem_flags flags,
874     const cl_image_format * image_format,
875     const cl_image_desc * image_desc,
876     void * host_ptr,
877     cl_int * errcode_ret),
878     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
879
880 OCL_FUNC_P(cl_mem, clCreateImage2D,
881     (cl_context context,
882     cl_mem_flags flags,
883     const cl_image_format * image_format,
884     size_t image_width,
885     size_t image_height,
886     size_t image_row_pitch,
887     void * host_ptr,
888     cl_int *errcode_ret),
889     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
890
891 OCL_FUNC(cl_int, clGetSupportedImageFormats,
892  (cl_context context,
893  cl_mem_flags flags,
894  cl_mem_object_type image_type,
895  cl_uint num_entries,
896  cl_image_format * image_formats,
897  cl_uint * num_image_formats),
898  (context, flags, image_type, num_entries, image_formats, num_image_formats))
899
900 /*
901 OCL_FUNC(cl_int, clGetMemObjectInfo,
902  (cl_mem memobj,
903  cl_mem_info param_name,
904  size_t param_value_size,
905  void * param_value,
906  size_t * param_value_size_ret),
907  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
908
909 OCL_FUNC(cl_int, clGetImageInfo,
910  (cl_mem image,
911  cl_image_info param_name,
912  size_t param_value_size,
913  void * param_value,
914  size_t * param_value_size_ret),
915  (image, param_name, param_value_size, param_value, param_value_size_ret))
916
917 OCL_FUNC(cl_int, clCreateKernelsInProgram,
918  (cl_program program,
919  cl_uint num_kernels,
920  cl_kernel * kernels,
921  cl_uint * num_kernels_ret),
922  (program, num_kernels, kernels, num_kernels_ret))
923
924 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
925
926 OCL_FUNC(cl_int, clGetKernelArgInfo,
927  (cl_kernel kernel,
928  cl_uint arg_indx,
929  cl_kernel_arg_info param_name,
930  size_t param_value_size,
931  void * param_value,
932  size_t * param_value_size_ret),
933  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
934
935 OCL_FUNC(cl_int, clEnqueueReadImage,
936  (cl_command_queue command_queue,
937  cl_mem image,
938  cl_bool blocking_read,
939  const size_t * origin[3],
940  const size_t * region[3],
941  size_t row_pitch,
942  size_t slice_pitch,
943  void * ptr,
944  cl_uint num_events_in_wait_list,
945  const cl_event * event_wait_list,
946  cl_event * event),
947  (command_queue, image, blocking_read, origin, region,
948  row_pitch, slice_pitch,
949  ptr,
950  num_events_in_wait_list,
951  event_wait_list,
952  event))
953
954 OCL_FUNC(cl_int, clEnqueueWriteImage,
955  (cl_command_queue command_queue,
956  cl_mem image,
957  cl_bool blocking_write,
958  const size_t * origin[3],
959  const size_t * region[3],
960  size_t input_row_pitch,
961  size_t input_slice_pitch,
962  const void * ptr,
963  cl_uint num_events_in_wait_list,
964  const cl_event * event_wait_list,
965  cl_event * event),
966  (command_queue, image, blocking_write, origin, region, input_row_pitch,
967  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
968
969 OCL_FUNC(cl_int, clEnqueueFillImage,
970  (cl_command_queue command_queue,
971  cl_mem image,
972  const void * fill_color,
973  const size_t * origin[3],
974  const size_t * region[3],
975  cl_uint num_events_in_wait_list,
976  const cl_event * event_wait_list,
977  cl_event * event),
978  (command_queue, image, fill_color, origin, region,
979  num_events_in_wait_list, event_wait_list, event))
980
981 OCL_FUNC(cl_int, clEnqueueCopyImage,
982  (cl_command_queue command_queue,
983  cl_mem src_image,
984  cl_mem dst_image,
985  const size_t * src_origin[3],
986  const size_t * dst_origin[3],
987  const size_t * region[3],
988  cl_uint num_events_in_wait_list,
989  const cl_event * event_wait_list,
990  cl_event * event),
991  (command_queue, src_image, dst_image, src_origin, dst_origin,
992  region, num_events_in_wait_list, event_wait_list, event))
993
994 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
995  (cl_command_queue command_queue,
996  cl_mem src_image,
997  cl_mem dst_buffer,
998  const size_t * src_origin[3],
999  const size_t * region[3],
1000  size_t dst_offset,
1001  cl_uint num_events_in_wait_list,
1002  const cl_event * event_wait_list,
1003  cl_event * event),
1004  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
1005  num_events_in_wait_list, event_wait_list, event))
1006 */
1007
1008 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
1009  (cl_command_queue command_queue,
1010  cl_mem src_buffer,
1011  cl_mem dst_image,
1012  size_t src_offset,
1013  const size_t dst_origin[3],
1014  const size_t region[3],
1015  cl_uint num_events_in_wait_list,
1016  const cl_event * event_wait_list,
1017  cl_event * event),
1018  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
1019  region, num_events_in_wait_list, event_wait_list, event))
1020
1021  OCL_FUNC(cl_int, clFlush,
1022  (cl_command_queue command_queue),
1023  (command_queue))
1024
1025 /*
1026 OCL_FUNC_P(void*, clEnqueueMapImage,
1027  (cl_command_queue command_queue,
1028  cl_mem image,
1029  cl_bool blocking_map,
1030  cl_map_flags map_flags,
1031  const size_t * origin[3],
1032  const size_t * region[3],
1033  size_t * image_row_pitch,
1034  size_t * image_slice_pitch,
1035  cl_uint num_events_in_wait_list,
1036  const cl_event * event_wait_list,
1037  cl_event * event,
1038  cl_int * errcode_ret),
1039  (command_queue, image, blocking_map, map_flags, origin, region,
1040  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
1041  event_wait_list, event, errcode_ret))
1042 */
1043
1044 /*
1045 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
1046
1047 OCL_FUNC(cl_int, clGetKernelInfo,
1048  (cl_kernel kernel,
1049  cl_kernel_info param_name,
1050  size_t param_value_size,
1051  void * param_value,
1052  size_t * param_value_size_ret),
1053  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
1054
1055 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
1056
1057 */
1058
1059 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
1060
1061
1062 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
1063     (cl_context context,
1064     cl_uint count,
1065     const char ** strings,
1066     const size_t * lengths,
1067     cl_int * errcode_ret),
1068     (context, count, strings, lengths, errcode_ret))
1069
1070 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
1071     (cl_context context,
1072     cl_uint num_devices,
1073     const cl_device_id * device_list,
1074     const size_t * lengths,
1075     const unsigned char ** binaries,
1076     cl_int * binary_status,
1077     cl_int * errcode_ret),
1078     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
1079
1080 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
1081
1082 OCL_FUNC(cl_int, clBuildProgram,
1083     (cl_program program,
1084     cl_uint num_devices,
1085     const cl_device_id * device_list,
1086     const char * options,
1087     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
1088     void * user_data),
1089     (program, num_devices, device_list, options, pfn_notify, user_data))
1090
1091 OCL_FUNC(cl_int, clGetProgramInfo,
1092     (cl_program program,
1093     cl_program_info param_name,
1094     size_t param_value_size,
1095     void * param_value,
1096     size_t * param_value_size_ret),
1097     (program, param_name, param_value_size, param_value, param_value_size_ret))
1098
1099 OCL_FUNC(cl_int, clGetProgramBuildInfo,
1100     (cl_program program,
1101     cl_device_id device,
1102     cl_program_build_info param_name,
1103     size_t param_value_size,
1104     void * param_value,
1105     size_t * param_value_size_ret),
1106     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
1107
1108 OCL_FUNC_P(cl_kernel, clCreateKernel,
1109     (cl_program program,
1110     const char * kernel_name,
1111     cl_int * errcode_ret),
1112     (program, kernel_name, errcode_ret))
1113
1114 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
1115
1116 OCL_FUNC(cl_int, clSetKernelArg,
1117     (cl_kernel kernel,
1118     cl_uint arg_index,
1119     size_t arg_size,
1120     const void * arg_value),
1121     (kernel, arg_index, arg_size, arg_value))
1122
1123 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
1124     (cl_kernel kernel,
1125     cl_device_id device,
1126     cl_kernel_work_group_info param_name,
1127     size_t param_value_size,
1128     void * param_value,
1129     size_t * param_value_size_ret),
1130     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
1131
1132 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
1133
1134 OCL_FUNC(cl_int, clEnqueueReadBuffer,
1135     (cl_command_queue command_queue,
1136     cl_mem buffer,
1137     cl_bool blocking_read,
1138     size_t offset,
1139     size_t size,
1140     void * ptr,
1141     cl_uint num_events_in_wait_list,
1142     const cl_event * event_wait_list,
1143     cl_event * event),
1144     (command_queue, buffer, blocking_read, offset, size, ptr,
1145     num_events_in_wait_list, event_wait_list, event))
1146
1147 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
1148     (cl_command_queue command_queue,
1149     cl_mem buffer,
1150     cl_bool blocking_read,
1151     const size_t * buffer_offset,
1152     const size_t * host_offset,
1153     const size_t * region,
1154     size_t buffer_row_pitch,
1155     size_t buffer_slice_pitch,
1156     size_t host_row_pitch,
1157     size_t host_slice_pitch,
1158     void * ptr,
1159     cl_uint num_events_in_wait_list,
1160     const cl_event * event_wait_list,
1161     cl_event * event),
1162     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
1163     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
1164     event_wait_list, event))
1165
1166 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
1167     (cl_command_queue command_queue,
1168     cl_mem buffer,
1169     cl_bool blocking_write,
1170     size_t offset,
1171     size_t size,
1172     const void * ptr,
1173     cl_uint num_events_in_wait_list,
1174     const cl_event * event_wait_list,
1175     cl_event * event),
1176     (command_queue, buffer, blocking_write, offset, size, ptr,
1177     num_events_in_wait_list, event_wait_list, event))
1178
1179 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
1180     (cl_command_queue command_queue,
1181     cl_mem buffer,
1182     cl_bool blocking_write,
1183     const size_t * buffer_offset,
1184     const size_t * host_offset,
1185     const size_t * region,
1186     size_t buffer_row_pitch,
1187     size_t buffer_slice_pitch,
1188     size_t host_row_pitch,
1189     size_t host_slice_pitch,
1190     const void * ptr,
1191     cl_uint num_events_in_wait_list,
1192     const cl_event * event_wait_list,
1193     cl_event * event),
1194     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
1195     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
1196     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
1197
1198 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
1199     (cl_command_queue command_queue,
1200     cl_mem buffer,
1201     const void * pattern,
1202     size_t pattern_size,
1203     size_t offset,
1204     size_t size,
1205     cl_uint num_events_in_wait_list,
1206     const cl_event * event_wait_list,
1207     cl_event * event),
1208     (command_queue, buffer, pattern, pattern_size, offset, size,
1209     num_events_in_wait_list, event_wait_list, event))*/
1210
1211 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
1212     (cl_command_queue command_queue,
1213     cl_mem src_buffer,
1214     cl_mem dst_buffer,
1215     size_t src_offset,
1216     size_t dst_offset,
1217     size_t size,
1218     cl_uint num_events_in_wait_list,
1219     const cl_event * event_wait_list,
1220     cl_event * event),
1221     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
1222     size, num_events_in_wait_list, event_wait_list, event))
1223
1224 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
1225     (cl_command_queue command_queue,
1226     cl_mem src_buffer,
1227     cl_mem dst_buffer,
1228     const size_t * src_origin,
1229     const size_t * dst_origin,
1230     const size_t * region,
1231     size_t src_row_pitch,
1232     size_t src_slice_pitch,
1233     size_t dst_row_pitch,
1234     size_t dst_slice_pitch,
1235     cl_uint num_events_in_wait_list,
1236     const cl_event * event_wait_list,
1237     cl_event * event),
1238     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
1239     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
1240     num_events_in_wait_list, event_wait_list, event))
1241
1242 OCL_FUNC_P(void*, clEnqueueMapBuffer,
1243     (cl_command_queue command_queue,
1244     cl_mem buffer,
1245     cl_bool blocking_map,
1246     cl_map_flags map_flags,
1247     size_t offset,
1248     size_t size,
1249     cl_uint num_events_in_wait_list,
1250     const cl_event * event_wait_list,
1251     cl_event * event,
1252     cl_int * errcode_ret),
1253     (command_queue, buffer, blocking_map, map_flags, offset, size,
1254     num_events_in_wait_list, event_wait_list, event, errcode_ret))
1255
1256 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
1257     (cl_command_queue command_queue,
1258     cl_mem memobj,
1259     void * mapped_ptr,
1260     cl_uint num_events_in_wait_list,
1261     const cl_event * event_wait_list,
1262     cl_event * event),
1263     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
1264
1265 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
1266     (cl_command_queue command_queue,
1267     cl_kernel kernel,
1268     cl_uint work_dim,
1269     const size_t * global_work_offset,
1270     const size_t * global_work_size,
1271     const size_t * local_work_size,
1272     cl_uint num_events_in_wait_list,
1273     const cl_event * event_wait_list,
1274     cl_event * event),
1275     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
1276     local_work_size, num_events_in_wait_list, event_wait_list, event))
1277
1278 OCL_FUNC(cl_int, clEnqueueTask,
1279     (cl_command_queue command_queue,
1280     cl_kernel kernel,
1281     cl_uint num_events_in_wait_list,
1282     const cl_event * event_wait_list,
1283     cl_event * event),
1284     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
1285
1286 OCL_FUNC(cl_int, clSetEventCallback,
1287     (cl_event event,
1288     cl_int command_exec_callback_type ,
1289     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
1290     void *user_data),
1291     (event, command_exec_callback_type, pfn_event_notify, user_data))
1292
1293 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
1294
1295 }
1296
1297 #endif
1298
1299 #ifndef CL_VERSION_1_2
1300 #define CL_VERSION_1_2
1301 #endif
1302
1303 #endif
1304
1305 #ifdef _DEBUG
1306 #define CV_OclDbgAssert CV_DbgAssert
1307 #else
1308 #define CV_OclDbgAssert(expr) (void)(expr)
1309 #endif
1310
1311 namespace cv { namespace ocl {
1312
1313 struct UMat2D
1314 {
1315     UMat2D(const UMat& m)
1316     {
1317         offset = (int)m.offset;
1318         step = (int)m.step;
1319         rows = m.rows;
1320         cols = m.cols;
1321     }
1322     int offset;
1323     int step;
1324     int rows;
1325     int cols;
1326 };
1327
1328 struct UMat3D
1329 {
1330     UMat3D(const UMat& m)
1331     {
1332         offset = (int)m.offset;
1333         step = (int)m.step.p[1];
1334         slicestep = (int)m.step.p[0];
1335         slices = (int)m.size.p[0];
1336         rows = m.size.p[1];
1337         cols = m.size.p[2];
1338     }
1339     int offset;
1340     int slicestep;
1341     int step;
1342     int slices;
1343     int rows;
1344     int cols;
1345 };
1346
1347 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
1348 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
1349 {
1350     static uint64 table[256];
1351     static bool initialized = false;
1352
1353     if( !initialized )
1354     {
1355         for( int i = 0; i < 256; i++ )
1356         {
1357             uint64 c = i;
1358             for( int j = 0; j < 8; j++ )
1359                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
1360             table[i] = c;
1361         }
1362         initialized = true;
1363     }
1364
1365     uint64 crc = ~crc0;
1366     for( size_t idx = 0; idx < size; idx++ )
1367         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
1368
1369     return ~crc;
1370 }
1371
1372 struct HashKey
1373 {
1374     typedef uint64 part;
1375     HashKey(part _a, part _b) : a(_a), b(_b) {}
1376     part a, b;
1377 };
1378
1379 inline bool operator == (const HashKey& h1, const HashKey& h2)
1380 {
1381     return h1.a == h2.a && h1.b == h2.b;
1382 }
1383
1384 inline bool operator < (const HashKey& h1, const HashKey& h2)
1385 {
1386     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
1387 }
1388
1389
1390 bool haveOpenCL()
1391 {
1392 #ifdef HAVE_OPENCL
1393     static bool g_isOpenCLInitialized = false;
1394     static bool g_isOpenCLAvailable = false;
1395
1396     if (!g_isOpenCLInitialized)
1397     {
1398         try
1399         {
1400             cl_uint n = 0;
1401             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
1402         }
1403         catch (...)
1404         {
1405             g_isOpenCLAvailable = false;
1406         }
1407         g_isOpenCLInitialized = true;
1408     }
1409     return g_isOpenCLAvailable;
1410 #else
1411     return false;
1412 #endif
1413 }
1414
1415 bool useOpenCL()
1416 {
1417     CoreTLSData* data = coreTlsData.get();
1418     if( data->useOpenCL < 0 )
1419     {
1420         try
1421         {
1422             data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() != NULL;
1423         }
1424         catch (...)
1425         {
1426             data->useOpenCL = 0;
1427         }
1428     }
1429     return data->useOpenCL > 0;
1430 }
1431
1432 void setUseOpenCL(bool flag)
1433 {
1434     if( haveOpenCL() )
1435     {
1436         CoreTLSData* data = coreTlsData.get();
1437         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
1438     }
1439 }
1440
1441 #ifdef HAVE_CLAMDBLAS
1442
1443 class AmdBlasHelper
1444 {
1445 public:
1446     static AmdBlasHelper & getInstance()
1447     {
1448         static AmdBlasHelper amdBlas;
1449         return amdBlas;
1450     }
1451
1452     bool isAvailable() const
1453     {
1454         return g_isAmdBlasAvailable;
1455     }
1456
1457     ~AmdBlasHelper()
1458     {
1459         try
1460         {
1461             clAmdBlasTeardown();
1462         }
1463         catch (...) { }
1464     }
1465
1466 protected:
1467     AmdBlasHelper()
1468     {
1469         if (!g_isAmdBlasInitialized)
1470         {
1471             AutoLock lock(m);
1472
1473             if (!g_isAmdBlasInitialized && haveOpenCL())
1474             {
1475                 try
1476                 {
1477                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
1478                 }
1479                 catch (...)
1480                 {
1481                     g_isAmdBlasAvailable = false;
1482                 }
1483             }
1484             else
1485                 g_isAmdBlasAvailable = false;
1486
1487             g_isAmdBlasInitialized = true;
1488         }
1489     }
1490
1491 private:
1492     static Mutex m;
1493     static bool g_isAmdBlasInitialized;
1494     static bool g_isAmdBlasAvailable;
1495 };
1496
1497 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
1498 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
1499 Mutex AmdBlasHelper::m;
1500
1501 bool haveAmdBlas()
1502 {
1503     return AmdBlasHelper::getInstance().isAvailable();
1504 }
1505
1506 #else
1507
1508 bool haveAmdBlas()
1509 {
1510     return false;
1511 }
1512
1513 #endif
1514
1515 #ifdef HAVE_CLAMDFFT
1516
1517 class AmdFftHelper
1518 {
1519 public:
1520     static AmdFftHelper & getInstance()
1521     {
1522         static AmdFftHelper amdFft;
1523         return amdFft;
1524     }
1525
1526     bool isAvailable() const
1527     {
1528         return g_isAmdFftAvailable;
1529     }
1530
1531     ~AmdFftHelper()
1532     {
1533         try
1534         {
1535 //            clAmdFftTeardown();
1536         }
1537         catch (...) { }
1538     }
1539
1540 protected:
1541     AmdFftHelper()
1542     {
1543         if (!g_isAmdFftInitialized)
1544         {
1545             AutoLock lock(m);
1546
1547             if (!g_isAmdFftInitialized && haveOpenCL())
1548             {
1549                 try
1550                 {
1551                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
1552                     g_isAmdFftAvailable = true;
1553                 }
1554                 catch (const Exception &)
1555                 {
1556                     g_isAmdFftAvailable = false;
1557                 }
1558             }
1559             else
1560                 g_isAmdFftAvailable = false;
1561
1562             g_isAmdFftInitialized = true;
1563         }
1564     }
1565
1566 private:
1567     static clAmdFftSetupData setupData;
1568     static Mutex m;
1569     static bool g_isAmdFftInitialized;
1570     static bool g_isAmdFftAvailable;
1571 };
1572
1573 clAmdFftSetupData AmdFftHelper::setupData;
1574 bool AmdFftHelper::g_isAmdFftAvailable = false;
1575 bool AmdFftHelper::g_isAmdFftInitialized = false;
1576 Mutex AmdFftHelper::m;
1577
1578 bool haveAmdFft()
1579 {
1580     return AmdFftHelper::getInstance().isAvailable();
1581 }
1582
1583 #else
1584
1585 bool haveAmdFft()
1586 {
1587     return false;
1588 }
1589
1590 #endif
1591
1592 void finish()
1593 {
1594     Queue::getDefault().finish();
1595 }
1596
1597 #define IMPLEMENT_REFCOUNTABLE() \
1598     void addref() { CV_XADD(&refcount, 1); } \
1599     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
1600     int refcount
1601
1602 /////////////////////////////////////////// Platform /////////////////////////////////////////////
1603
1604 struct Platform::Impl
1605 {
1606     Impl()
1607     {
1608         refcount = 1;
1609         handle = 0;
1610         initialized = false;
1611     }
1612
1613     ~Impl() {}
1614
1615     void init()
1616     {
1617         if( !initialized )
1618         {
1619             //cl_uint num_entries
1620             cl_uint n = 0;
1621             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
1622                 handle = 0;
1623             if( handle != 0 )
1624             {
1625                 char buf[1000];
1626                 size_t len = 0;
1627                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
1628                 buf[len] = '\0';
1629                 vendor = String(buf);
1630             }
1631
1632             initialized = true;
1633         }
1634     }
1635
1636     IMPLEMENT_REFCOUNTABLE();
1637
1638     cl_platform_id handle;
1639     String vendor;
1640     bool initialized;
1641 };
1642
1643 Platform::Platform()
1644 {
1645     p = 0;
1646 }
1647
1648 Platform::~Platform()
1649 {
1650     if(p)
1651         p->release();
1652 }
1653
1654 Platform::Platform(const Platform& pl)
1655 {
1656     p = (Impl*)pl.p;
1657     if(p)
1658         p->addref();
1659 }
1660
1661 Platform& Platform::operator = (const Platform& pl)
1662 {
1663     Impl* newp = (Impl*)pl.p;
1664     if(newp)
1665         newp->addref();
1666     if(p)
1667         p->release();
1668     p = newp;
1669     return *this;
1670 }
1671
1672 void* Platform::ptr() const
1673 {
1674     return p ? p->handle : 0;
1675 }
1676
1677 Platform& Platform::getDefault()
1678 {
1679     static Platform p;
1680     if( !p.p )
1681     {
1682         p.p = new Impl;
1683         p.p->init();
1684     }
1685     return p;
1686 }
1687
1688 /////////////////////////////////////// Device ////////////////////////////////////////////
1689
1690 // deviceVersion has format
1691 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
1692 // by specification
1693 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
1694 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
1695 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
1696 {
1697     major = minor = 0;
1698     if (10 >= deviceVersion.length())
1699         return;
1700     const char *pstr = deviceVersion.c_str();
1701     if (0 != strncmp(pstr, "OpenCL ", 7))
1702         return;
1703     size_t ppos = deviceVersion.find('.', 7);
1704     if (String::npos == ppos)
1705         return;
1706     String temp = deviceVersion.substr(7, ppos - 7);
1707     major = atoi(temp.c_str());
1708     temp = deviceVersion.substr(ppos + 1);
1709     minor = atoi(temp.c_str());
1710 }
1711
1712 struct Device::Impl
1713 {
1714     Impl(void* d)
1715     {
1716         handle = (cl_device_id)d;
1717         refcount = 1;
1718
1719         name_ = getStrProp(CL_DEVICE_NAME);
1720         version_ = getStrProp(CL_DEVICE_VERSION);
1721         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
1722         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
1723         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
1724         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
1725         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
1726         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
1727
1728         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
1729         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
1730
1731         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
1732         if (vendorName_ == "Advanced Micro Devices, Inc." ||
1733             vendorName_ == "AMD")
1734             vendorID_ = VENDOR_AMD;
1735         else if (vendorName_ == "Intel(R) Corporation")
1736             vendorID_ = VENDOR_INTEL;
1737         else if (vendorName_ == "NVIDIA Corporation")
1738             vendorID_ = VENDOR_NVIDIA;
1739         else
1740             vendorID_ = UNKNOWN_VENDOR;
1741     }
1742
1743     template<typename _TpCL, typename _TpOut>
1744     _TpOut getProp(cl_device_info prop) const
1745     {
1746         _TpCL temp=_TpCL();
1747         size_t sz = 0;
1748
1749         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1750             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
1751     }
1752
1753     bool getBoolProp(cl_device_info prop) const
1754     {
1755         cl_bool temp = CL_FALSE;
1756         size_t sz = 0;
1757
1758         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
1759             sz == sizeof(temp) ? temp != 0 : false;
1760     }
1761
1762     String getStrProp(cl_device_info prop) const
1763     {
1764         char buf[1024];
1765         size_t sz=0;
1766         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
1767             sz < sizeof(buf) ? String(buf) : String();
1768     }
1769
1770     IMPLEMENT_REFCOUNTABLE();
1771     cl_device_id handle;
1772
1773     String name_;
1774     String version_;
1775     int doubleFPConfig_;
1776     bool hostUnifiedMemory_;
1777     int maxComputeUnits_;
1778     size_t maxWorkGroupSize_;
1779     int type_;
1780     int deviceVersionMajor_;
1781     int deviceVersionMinor_;
1782     String driverVersion_;
1783     String vendorName_;
1784     int vendorID_;
1785 };
1786
1787
1788 Device::Device()
1789 {
1790     p = 0;
1791 }
1792
1793 Device::Device(void* d)
1794 {
1795     p = 0;
1796     set(d);
1797 }
1798
1799 Device::Device(const Device& d)
1800 {
1801     p = d.p;
1802     if(p)
1803         p->addref();
1804 }
1805
1806 Device& Device::operator = (const Device& d)
1807 {
1808     Impl* newp = (Impl*)d.p;
1809     if(newp)
1810         newp->addref();
1811     if(p)
1812         p->release();
1813     p = newp;
1814     return *this;
1815 }
1816
1817 Device::~Device()
1818 {
1819     if(p)
1820         p->release();
1821 }
1822
1823 void Device::set(void* d)
1824 {
1825     if(p)
1826         p->release();
1827     p = new Impl(d);
1828 }
1829
1830 void* Device::ptr() const
1831 {
1832     return p ? p->handle : 0;
1833 }
1834
1835 String Device::name() const
1836 { return p ? p->name_ : String(); }
1837
1838 String Device::extensions() const
1839 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1840
1841 String Device::version() const
1842 { return p ? p->version_ : String(); }
1843
1844 String Device::vendorName() const
1845 { return p ? p->vendorName_ : String(); }
1846
1847 int Device::vendorID() const
1848 { return p ? p->vendorID_ : 0; }
1849
1850 String Device::OpenCL_C_Version() const
1851 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
1852
1853 String Device::OpenCLVersion() const
1854 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
1855
1856 int Device::deviceVersionMajor() const
1857 { return p ? p->deviceVersionMajor_ : 0; }
1858
1859 int Device::deviceVersionMinor() const
1860 { return p ? p->deviceVersionMinor_ : 0; }
1861
1862 String Device::driverVersion() const
1863 { return p ? p->driverVersion_ : String(); }
1864
1865 int Device::type() const
1866 { return p ? p->type_ : 0; }
1867
1868 int Device::addressBits() const
1869 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
1870
1871 bool Device::available() const
1872 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
1873
1874 bool Device::compilerAvailable() const
1875 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
1876
1877 bool Device::linkerAvailable() const
1878 #ifdef CL_VERSION_1_2
1879 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
1880 #else
1881 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1882 #endif
1883
1884 int Device::doubleFPConfig() const
1885 { return p ? p->doubleFPConfig_ : 0; }
1886
1887 int Device::singleFPConfig() const
1888 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
1889
1890 int Device::halfFPConfig() const
1891 #ifdef CL_VERSION_1_2
1892 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
1893 #else
1894 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1895 #endif
1896
1897 bool Device::endianLittle() const
1898 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
1899
1900 bool Device::errorCorrectionSupport() const
1901 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
1902
1903 int Device::executionCapabilities() const
1904 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
1905
1906 size_t Device::globalMemCacheSize() const
1907 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
1908
1909 int Device::globalMemCacheType() const
1910 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
1911
1912 int Device::globalMemCacheLineSize() const
1913 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
1914
1915 size_t Device::globalMemSize() const
1916 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
1917
1918 size_t Device::localMemSize() const
1919 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
1920
1921 int Device::localMemType() const
1922 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
1923
1924 bool Device::hostUnifiedMemory() const
1925 { return p ? p->hostUnifiedMemory_ : false; }
1926
1927 bool Device::imageSupport() const
1928 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
1929
1930 bool Device::imageFromBufferSupport() const
1931 {
1932     bool ret = false;
1933     if (p)
1934     {
1935         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
1936         if (pos != String::npos)
1937         {
1938             ret = true;
1939         }
1940     }
1941     return ret;
1942 }
1943
1944 uint Device::imagePitchAlignment() const
1945 {
1946 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
1947     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
1948 #else
1949     return 0;
1950 #endif
1951 }
1952
1953 uint Device::imageBaseAddressAlignment() const
1954 {
1955 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
1956     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
1957 #else
1958     return 0;
1959 #endif
1960 }
1961
1962 size_t Device::image2DMaxWidth() const
1963 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
1964
1965 size_t Device::image2DMaxHeight() const
1966 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
1967
1968 size_t Device::image3DMaxWidth() const
1969 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
1970
1971 size_t Device::image3DMaxHeight() const
1972 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
1973
1974 size_t Device::image3DMaxDepth() const
1975 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
1976
1977 size_t Device::imageMaxBufferSize() const
1978 #ifdef CL_VERSION_1_2
1979 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
1980 #else
1981 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1982 #endif
1983
1984 size_t Device::imageMaxArraySize() const
1985 #ifdef CL_VERSION_1_2
1986 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
1987 #else
1988 { CV_REQUIRE_OPENCL_1_2_ERROR; }
1989 #endif
1990
1991 int Device::maxClockFrequency() const
1992 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
1993
1994 int Device::maxComputeUnits() const
1995 { return p ? p->maxComputeUnits_ : 0; }
1996
1997 int Device::maxConstantArgs() const
1998 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
1999
2000 size_t Device::maxConstantBufferSize() const
2001 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
2002
2003 size_t Device::maxMemAllocSize() const
2004 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
2005
2006 size_t Device::maxParameterSize() const
2007 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
2008
2009 int Device::maxReadImageArgs() const
2010 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
2011
2012 int Device::maxWriteImageArgs() const
2013 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
2014
2015 int Device::maxSamplers() const
2016 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
2017
2018 size_t Device::maxWorkGroupSize() const
2019 { return p ? p->maxWorkGroupSize_ : 0; }
2020
2021 int Device::maxWorkItemDims() const
2022 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
2023
2024 void Device::maxWorkItemSizes(size_t* sizes) const
2025 {
2026     if(p)
2027     {
2028         const int MAX_DIMS = 32;
2029         size_t retsz = 0;
2030         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
2031                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
2032     }
2033 }
2034
2035 int Device::memBaseAddrAlign() const
2036 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
2037
2038 int Device::nativeVectorWidthChar() const
2039 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
2040
2041 int Device::nativeVectorWidthShort() const
2042 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
2043
2044 int Device::nativeVectorWidthInt() const
2045 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
2046
2047 int Device::nativeVectorWidthLong() const
2048 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
2049
2050 int Device::nativeVectorWidthFloat() const
2051 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
2052
2053 int Device::nativeVectorWidthDouble() const
2054 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
2055
2056 int Device::nativeVectorWidthHalf() const
2057 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
2058
2059 int Device::preferredVectorWidthChar() const
2060 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
2061
2062 int Device::preferredVectorWidthShort() const
2063 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
2064
2065 int Device::preferredVectorWidthInt() const
2066 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
2067
2068 int Device::preferredVectorWidthLong() const
2069 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
2070
2071 int Device::preferredVectorWidthFloat() const
2072 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
2073
2074 int Device::preferredVectorWidthDouble() const
2075 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
2076
2077 int Device::preferredVectorWidthHalf() const
2078 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
2079
2080 size_t Device::printfBufferSize() const
2081 #ifdef CL_VERSION_1_2
2082 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
2083 #else
2084 { CV_REQUIRE_OPENCL_1_2_ERROR; }
2085 #endif
2086
2087
2088 size_t Device::profilingTimerResolution() const
2089 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
2090
2091 const Device& Device::getDefault()
2092 {
2093     const Context& ctx = Context::getDefault();
2094     int idx = coreTlsData.get()->device;
2095     return ctx.device(idx);
2096 }
2097
2098 ////////////////////////////////////// Context ///////////////////////////////////////////////////
2099
2100 template <typename Functor, typename ObjectType>
2101 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
2102 {
2103     ::size_t required;
2104     cl_int err = f(obj, name, 0, NULL, &required);
2105     if (err != CL_SUCCESS)
2106         return err;
2107
2108     param.clear();
2109     if (required > 0)
2110     {
2111         AutoBuffer<char> buf(required + 1);
2112         char* ptr = (char*)buf; // cleanup is not needed
2113         err = f(obj, name, required, ptr, NULL);
2114         if (err != CL_SUCCESS)
2115             return err;
2116         param = ptr;
2117     }
2118
2119     return CL_SUCCESS;
2120 }
2121
2122 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
2123 {
2124     elems.clear();
2125     if (s.size() == 0)
2126         return;
2127     std::istringstream ss(s);
2128     std::string item;
2129     while (!ss.eof())
2130     {
2131         std::getline(ss, item, delim);
2132         elems.push_back(item);
2133     }
2134 }
2135
2136 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
2137 // Sample: AMD:GPU:
2138 // Sample: AMD:GPU:Tahiti
2139 // Sample: :GPU|CPU: = '' = ':' = '::'
2140 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
2141         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
2142 {
2143     std::vector<std::string> parts;
2144     split(configurationStr, ':', parts);
2145     if (parts.size() > 3)
2146     {
2147         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
2148         return false;
2149     }
2150     if (parts.size() > 2)
2151         deviceNameOrID = parts[2];
2152     if (parts.size() > 1)
2153     {
2154         split(parts[1], '|', deviceTypes);
2155     }
2156     if (parts.size() > 0)
2157     {
2158         platform = parts[0];
2159     }
2160     return true;
2161 }
2162
2163 #ifdef HAVE_WINRT
2164 static cl_device_id selectOpenCLDevice()
2165 {
2166     return NULL;
2167 }
2168 #else
2169 static cl_device_id selectOpenCLDevice()
2170 {
2171     std::string platform, deviceName;
2172     std::vector<std::string> deviceTypes;
2173
2174     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
2175     if (configuration && !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName))
2176         return NULL;
2177
2178     bool isID = false;
2179     int deviceID = -1;
2180     if (deviceName.length() == 1)
2181     // We limit ID range to 0..9, because we want to write:
2182     // - '2500' to mean i5-2500
2183     // - '8350' to mean AMD FX-8350
2184     // - '650' to mean GeForce 650
2185     // To extend ID range change condition to '> 0'
2186     {
2187         isID = true;
2188         for (size_t i = 0; i < deviceName.length(); i++)
2189         {
2190             if (!isdigit(deviceName[i]))
2191             {
2192                 isID = false;
2193                 break;
2194             }
2195         }
2196         if (isID)
2197         {
2198             deviceID = atoi(deviceName.c_str());
2199             if (deviceID < 0)
2200                 return NULL;
2201         }
2202     }
2203
2204     std::vector<cl_platform_id> platforms;
2205     {
2206         cl_uint numPlatforms = 0;
2207         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
2208
2209         if (numPlatforms == 0)
2210             return NULL;
2211         platforms.resize((size_t)numPlatforms);
2212         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
2213         platforms.resize(numPlatforms);
2214     }
2215
2216     int selectedPlatform = -1;
2217     if (platform.length() > 0)
2218     {
2219         for (size_t i = 0; i < platforms.size(); i++)
2220         {
2221             std::string name;
2222             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
2223             if (name.find(platform) != std::string::npos)
2224             {
2225                 selectedPlatform = (int)i;
2226                 break;
2227             }
2228         }
2229         if (selectedPlatform == -1)
2230         {
2231             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
2232             goto not_found;
2233         }
2234     }
2235     if (deviceTypes.size() == 0)
2236     {
2237         if (!isID)
2238         {
2239             deviceTypes.push_back("GPU");
2240             if (configuration)
2241                 deviceTypes.push_back("CPU");
2242         }
2243         else
2244             deviceTypes.push_back("ALL");
2245     }
2246     for (size_t t = 0; t < deviceTypes.size(); t++)
2247     {
2248         int deviceType = 0;
2249         std::string tempStrDeviceType = deviceTypes[t];
2250         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
2251
2252         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2253             deviceType = Device::TYPE_GPU;
2254         else if (tempStrDeviceType == "cpu")
2255             deviceType = Device::TYPE_CPU;
2256         else if (tempStrDeviceType == "accelerator")
2257             deviceType = Device::TYPE_ACCELERATOR;
2258         else if (tempStrDeviceType == "all")
2259             deviceType = Device::TYPE_ALL;
2260         else
2261         {
2262             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
2263             goto not_found;
2264         }
2265
2266         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
2267         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
2268                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
2269                 i++)
2270         {
2271             cl_uint count = 0;
2272             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
2273             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2274             if (count == 0)
2275                 continue;
2276             size_t base = devices.size();
2277             devices.resize(base + count);
2278             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
2279             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
2280         }
2281
2282         for (size_t i = (isID ? deviceID : 0);
2283              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
2284              i++)
2285         {
2286             std::string name;
2287             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
2288             cl_bool useGPU = true;
2289             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
2290             {
2291                 cl_bool isIGPU = CL_FALSE;
2292                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
2293                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
2294             }
2295             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
2296             {
2297                 // TODO check for OpenCL 1.1
2298                 return devices[i];
2299             }
2300         }
2301     }
2302
2303 not_found:
2304     std::cerr << "ERROR: Required OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
2305             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
2306             << "    Device types: ";
2307     for (size_t t = 0; t < deviceTypes.size(); t++)
2308         std::cerr << deviceTypes[t] << " ";
2309
2310     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
2311     CV_Error(CL_INVALID_DEVICE, "Requested OpenCL device is not found");
2312     return NULL;
2313 }
2314 #endif
2315
2316 struct Context::Impl
2317 {
2318     Impl()
2319     {
2320         refcount = 1;
2321         handle = 0;
2322     }
2323
2324     void setDefault()
2325     {
2326         CV_Assert(handle == NULL);
2327
2328         cl_device_id d = selectOpenCLDevice();
2329
2330         if (d == NULL)
2331             return;
2332
2333         cl_platform_id pl = NULL;
2334         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
2335
2336         cl_context_properties prop[] =
2337         {
2338             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2339             0
2340         };
2341
2342         // !!! in the current implementation force the number of devices to 1 !!!
2343         cl_uint nd = 1;
2344         cl_int status;
2345
2346         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
2347
2348         bool ok = handle != 0 && status == CL_SUCCESS;
2349         if( ok )
2350         {
2351             devices.resize(nd);
2352             devices[0].set(d);
2353         }
2354         else
2355             handle = NULL;
2356     }
2357
2358     Impl(int dtype0)
2359     {
2360         refcount = 1;
2361         handle = 0;
2362
2363         cl_int retval = 0;
2364         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
2365         cl_context_properties prop[] =
2366         {
2367             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
2368             0
2369         };
2370
2371         cl_uint i, nd0 = 0, nd = 0;
2372         int dtype = dtype0 & 15;
2373         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
2374
2375         AutoBuffer<void*> dlistbuf(nd0*2+1);
2376         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
2377         cl_device_id* dlist_new = dlist + nd0;
2378         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
2379         String name0;
2380
2381         for(i = 0; i < nd0; i++)
2382         {
2383             Device d(dlist[i]);
2384             if( !d.available() || !d.compilerAvailable() )
2385                 continue;
2386             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
2387                 continue;
2388             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
2389                 continue;
2390             String name = d.name();
2391             if( nd != 0 && name != name0 )
2392                 continue;
2393             name0 = name;
2394             dlist_new[nd++] = dlist[i];
2395         }
2396
2397         if(nd == 0)
2398             return;
2399
2400         // !!! in the current implementation force the number of devices to 1 !!!
2401         nd = 1;
2402
2403         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
2404         bool ok = handle != 0 && retval == CL_SUCCESS;
2405         if( ok )
2406         {
2407             devices.resize(nd);
2408             for( i = 0; i < nd; i++ )
2409                 devices[i].set(dlist_new[i]);
2410         }
2411     }
2412
2413     ~Impl()
2414     {
2415         if(handle)
2416         {
2417             clReleaseContext(handle);
2418             handle = NULL;
2419         }
2420         devices.clear();
2421     }
2422
2423     Program getProg(const ProgramSource& src,
2424                     const String& buildflags, String& errmsg)
2425     {
2426         String prefix = Program::getPrefix(buildflags);
2427         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
2428         phash_t::iterator it = phash.find(k);
2429         if( it != phash.end() )
2430             return it->second;
2431         //String filename = format("%08x%08x_%08x%08x.clb2",
2432         Program prog(src, buildflags, errmsg);
2433         if(prog.ptr())
2434             phash.insert(std::pair<HashKey,Program>(k, prog));
2435         return prog;
2436     }
2437
2438     IMPLEMENT_REFCOUNTABLE();
2439
2440     cl_context handle;
2441     std::vector<Device> devices;
2442
2443     typedef ProgramSource::hash_t hash_t;
2444
2445     struct HashKey
2446     {
2447         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
2448         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
2449         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
2450         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
2451         hash_t a, b;
2452     };
2453     typedef std::map<HashKey, Program> phash_t;
2454     phash_t phash;
2455 };
2456
2457
2458 Context::Context()
2459 {
2460     p = 0;
2461 }
2462
2463 Context::Context(int dtype)
2464 {
2465     p = 0;
2466     create(dtype);
2467 }
2468
2469 bool Context::create()
2470 {
2471     if( !haveOpenCL() )
2472         return false;
2473     if(p)
2474         p->release();
2475     p = new Impl();
2476     if(!p->handle)
2477     {
2478         delete p;
2479         p = 0;
2480     }
2481     return p != 0;
2482 }
2483
2484 bool Context::create(int dtype0)
2485 {
2486     if( !haveOpenCL() )
2487         return false;
2488     if(p)
2489         p->release();
2490     p = new Impl(dtype0);
2491     if(!p->handle)
2492     {
2493         delete p;
2494         p = 0;
2495     }
2496     return p != 0;
2497 }
2498
2499 Context::~Context()
2500 {
2501     if (p)
2502     {
2503         p->release();
2504         p = NULL;
2505     }
2506 }
2507
2508 Context::Context(const Context& c)
2509 {
2510     p = (Impl*)c.p;
2511     if(p)
2512         p->addref();
2513 }
2514
2515 Context& Context::operator = (const Context& c)
2516 {
2517     Impl* newp = (Impl*)c.p;
2518     if(newp)
2519         newp->addref();
2520     if(p)
2521         p->release();
2522     p = newp;
2523     return *this;
2524 }
2525
2526 void* Context::ptr() const
2527 {
2528     return p == NULL ? NULL : p->handle;
2529 }
2530
2531 size_t Context::ndevices() const
2532 {
2533     return p ? p->devices.size() : 0;
2534 }
2535
2536 const Device& Context::device(size_t idx) const
2537 {
2538     static Device dummy;
2539     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
2540 }
2541
2542 Context& Context::getDefault(bool initialize)
2543 {
2544     static Context* ctx = new Context();
2545     if(!ctx->p && haveOpenCL())
2546     {
2547         if (!ctx->p)
2548             ctx->p = new Impl();
2549         if (initialize)
2550         {
2551             // do not create new Context right away.
2552             // First, try to retrieve existing context of the same type.
2553             // In its turn, Platform::getContext() may call Context::create()
2554             // if there is no such context.
2555             if (ctx->p->handle == NULL)
2556                 ctx->p->setDefault();
2557         }
2558     }
2559
2560     return *ctx;
2561 }
2562
2563 Program Context::getProg(const ProgramSource& prog,
2564                          const String& buildopts, String& errmsg)
2565 {
2566     return p ? p->getProg(prog, buildopts, errmsg) : Program();
2567 }
2568
2569 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
2570 {
2571     cl_context context = (cl_context)_context;
2572     cl_device_id device = (cl_device_id)_device;
2573
2574     // cleanup old context
2575     Context::Impl * impl = ctx.p;
2576     if (impl->handle)
2577     {
2578         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
2579     }
2580     impl->devices.clear();
2581
2582     impl->handle = context;
2583     impl->devices.resize(1);
2584     impl->devices[0].set(device);
2585
2586     Platform& p = Platform::getDefault();
2587     Platform::Impl* pImpl = p.p;
2588     pImpl->handle = (cl_platform_id)platform;
2589 }
2590
2591 /////////////////////////////////////////// Queue /////////////////////////////////////////////
2592
2593 struct Queue::Impl
2594 {
2595     Impl(const Context& c, const Device& d)
2596     {
2597         refcount = 1;
2598         const Context* pc = &c;
2599         cl_context ch = (cl_context)pc->ptr();
2600         if( !ch )
2601         {
2602             pc = &Context::getDefault();
2603             ch = (cl_context)pc->ptr();
2604         }
2605         cl_device_id dh = (cl_device_id)d.ptr();
2606         if( !dh )
2607             dh = (cl_device_id)pc->device(0).ptr();
2608         cl_int retval = 0;
2609         handle = clCreateCommandQueue(ch, dh, 0, &retval);
2610         CV_OclDbgAssert(retval == CL_SUCCESS);
2611     }
2612
2613     ~Impl()
2614     {
2615 #ifdef _WIN32
2616         if (!cv::__termination)
2617 #endif
2618         {
2619             if(handle)
2620             {
2621                 clFinish(handle);
2622                 clReleaseCommandQueue(handle);
2623                 handle = NULL;
2624             }
2625         }
2626     }
2627
2628     IMPLEMENT_REFCOUNTABLE();
2629
2630     cl_command_queue handle;
2631 };
2632
2633 Queue::Queue()
2634 {
2635     p = 0;
2636 }
2637
2638 Queue::Queue(const Context& c, const Device& d)
2639 {
2640     p = 0;
2641     create(c, d);
2642 }
2643
2644 Queue::Queue(const Queue& q)
2645 {
2646     p = q.p;
2647     if(p)
2648         p->addref();
2649 }
2650
2651 Queue& Queue::operator = (const Queue& q)
2652 {
2653     Impl* newp = (Impl*)q.p;
2654     if(newp)
2655         newp->addref();
2656     if(p)
2657         p->release();
2658     p = newp;
2659     return *this;
2660 }
2661
2662 Queue::~Queue()
2663 {
2664     if(p)
2665         p->release();
2666 }
2667
2668 bool Queue::create(const Context& c, const Device& d)
2669 {
2670     if(p)
2671         p->release();
2672     p = new Impl(c, d);
2673     return p->handle != 0;
2674 }
2675
2676 void Queue::finish()
2677 {
2678     if(p && p->handle)
2679     {
2680         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
2681     }
2682 }
2683
2684 void* Queue::ptr() const
2685 {
2686     return p ? p->handle : 0;
2687 }
2688
2689 Queue& Queue::getDefault()
2690 {
2691     Queue& q = coreTlsData.get()->oclQueue;
2692     if( !q.p && haveOpenCL() )
2693         q.create(Context::getDefault());
2694     return q;
2695 }
2696
2697 static cl_command_queue getQueue(const Queue& q)
2698 {
2699     cl_command_queue qq = (cl_command_queue)q.ptr();
2700     if(!qq)
2701         qq = (cl_command_queue)Queue::getDefault().ptr();
2702     return qq;
2703 }
2704
2705 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
2706
2707 KernelArg::KernelArg()
2708     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
2709 {
2710 }
2711
2712 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
2713     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
2714 {
2715 }
2716
2717 KernelArg KernelArg::Constant(const Mat& m)
2718 {
2719     CV_Assert(m.isContinuous());
2720     return KernelArg(CONSTANT, 0, 0, 0, m.data, m.total()*m.elemSize());
2721 }
2722
2723 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
2724
2725 struct Kernel::Impl
2726 {
2727     Impl(const char* kname, const Program& prog) :
2728         refcount(1), e(0), nu(0)
2729     {
2730         cl_program ph = (cl_program)prog.ptr();
2731         cl_int retval = 0;
2732         handle = ph != 0 ?
2733             clCreateKernel(ph, kname, &retval) : 0;
2734         CV_OclDbgAssert(retval == CL_SUCCESS);
2735         for( int i = 0; i < MAX_ARRS; i++ )
2736             u[i] = 0;
2737         haveTempDstUMats = false;
2738     }
2739
2740     void cleanupUMats()
2741     {
2742         for( int i = 0; i < MAX_ARRS; i++ )
2743             if( u[i] )
2744             {
2745                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
2746                     u[i]->currAllocator->deallocate(u[i]);
2747                 u[i] = 0;
2748             }
2749         nu = 0;
2750         haveTempDstUMats = false;
2751     }
2752
2753     void addUMat(const UMat& m, bool dst)
2754     {
2755         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
2756         u[nu] = m.u;
2757         CV_XADD(&m.u->urefcount, 1);
2758         nu++;
2759         if(dst && m.u->tempUMat())
2760             haveTempDstUMats = true;
2761     }
2762
2763     void addImage(const Image2D& image)
2764     {
2765         images.push_back(image);
2766     }
2767
2768     void finit()
2769     {
2770         cleanupUMats();
2771         images.clear();
2772         if(e) { clReleaseEvent(e); e = 0; }
2773         release();
2774     }
2775
2776     ~Impl()
2777     {
2778         if(handle)
2779             clReleaseKernel(handle);
2780     }
2781
2782     IMPLEMENT_REFCOUNTABLE();
2783
2784     cl_kernel handle;
2785     cl_event e;
2786     enum { MAX_ARRS = 16 };
2787     UMatData* u[MAX_ARRS];
2788     int nu;
2789     std::list<Image2D> images;
2790     bool haveTempDstUMats;
2791 };
2792
2793 }}
2794
2795 extern "C"
2796 {
2797 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
2798 {
2799     ((cv::ocl::Kernel::Impl*)p)->finit();
2800 }
2801
2802 }
2803
2804 namespace cv { namespace ocl {
2805
2806 Kernel::Kernel()
2807 {
2808     p = 0;
2809 }
2810
2811 Kernel::Kernel(const char* kname, const Program& prog)
2812 {
2813     p = 0;
2814     create(kname, prog);
2815 }
2816
2817 Kernel::Kernel(const char* kname, const ProgramSource& src,
2818                const String& buildopts, String* errmsg)
2819 {
2820     p = 0;
2821     create(kname, src, buildopts, errmsg);
2822 }
2823
2824 Kernel::Kernel(const Kernel& k)
2825 {
2826     p = k.p;
2827     if(p)
2828         p->addref();
2829 }
2830
2831 Kernel& Kernel::operator = (const Kernel& k)
2832 {
2833     Impl* newp = (Impl*)k.p;
2834     if(newp)
2835         newp->addref();
2836     if(p)
2837         p->release();
2838     p = newp;
2839     return *this;
2840 }
2841
2842 Kernel::~Kernel()
2843 {
2844     if(p)
2845         p->release();
2846 }
2847
2848 bool Kernel::create(const char* kname, const Program& prog)
2849 {
2850     if(p)
2851         p->release();
2852     p = new Impl(kname, prog);
2853     if(p->handle == 0)
2854     {
2855         p->release();
2856         p = 0;
2857     }
2858     return p != 0;
2859 }
2860
2861 bool Kernel::create(const char* kname, const ProgramSource& src,
2862                     const String& buildopts, String* errmsg)
2863 {
2864     if(p)
2865     {
2866         p->release();
2867         p = 0;
2868     }
2869     String tempmsg;
2870     if( !errmsg ) errmsg = &tempmsg;
2871     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
2872     return create(kname, prog);
2873 }
2874
2875 void* Kernel::ptr() const
2876 {
2877     return p ? p->handle : 0;
2878 }
2879
2880 bool Kernel::empty() const
2881 {
2882     return ptr() == 0;
2883 }
2884
2885 int Kernel::set(int i, const void* value, size_t sz)
2886 {
2887     if (!p || !p->handle)
2888         return -1;
2889     if (i < 0)
2890         return i;
2891     if( i == 0 )
2892         p->cleanupUMats();
2893
2894     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
2895     CV_OclDbgAssert(retval == CL_SUCCESS);
2896     if (retval != CL_SUCCESS)
2897         return -1;
2898     return i+1;
2899 }
2900
2901 int Kernel::set(int i, const Image2D& image2D)
2902 {
2903     p->addImage(image2D);
2904     cl_mem h = (cl_mem)image2D.ptr();
2905     return set(i, &h, sizeof(h));
2906 }
2907
2908 int Kernel::set(int i, const UMat& m)
2909 {
2910     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
2911 }
2912
2913 int Kernel::set(int i, const KernelArg& arg)
2914 {
2915     if( !p || !p->handle )
2916         return -1;
2917     if (i < 0)
2918         return i;
2919     if( i == 0 )
2920         p->cleanupUMats();
2921     if( arg.m )
2922     {
2923         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
2924                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
2925         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
2926         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
2927
2928         if (!h)
2929         {
2930             p->release();
2931             p = 0;
2932             return -1;
2933         }
2934
2935         if (ptronly)
2936             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i++, sizeof(h), &h) == CL_SUCCESS);
2937         else if( arg.m->dims <= 2 )
2938         {
2939             UMat2D u2d(*arg.m);
2940             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2941             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
2942             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
2943             i += 3;
2944
2945             if( !(arg.flags & KernelArg::NO_SIZE) )
2946             {
2947                 int cols = u2d.cols*arg.wscale/arg.iwscale;
2948                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
2949                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
2950                 i += 2;
2951             }
2952         }
2953         else
2954         {
2955             UMat3D u3d(*arg.m);
2956             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
2957             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
2958             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
2959             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
2960             i += 4;
2961             if( !(arg.flags & KernelArg::NO_SIZE) )
2962             {
2963                 int cols = u3d.cols*arg.wscale/arg.iwscale;
2964                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
2965                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
2966                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
2967                 i += 3;
2968             }
2969         }
2970         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
2971         return i;
2972     }
2973     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
2974     return i+1;
2975 }
2976
2977
2978 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
2979                  bool sync, const Queue& q)
2980 {
2981     if(!p || !p->handle || p->e != 0)
2982         return false;
2983
2984     cl_command_queue qq = getQueue(q);
2985     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
2986     size_t total = 1;
2987     CV_Assert(_globalsize != 0);
2988     for (int i = 0; i < dims; i++)
2989     {
2990         size_t val = _localsize ? _localsize[i] :
2991             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
2992         CV_Assert( val > 0 );
2993         total *= _globalsize[i];
2994         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
2995     }
2996     if( total == 0 )
2997         return true;
2998     if( p->haveTempDstUMats )
2999         sync = true;
3000     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
3001                                            offset, globalsize, _localsize, 0, 0,
3002                                            sync ? 0 : &p->e);
3003     if( sync || retval != CL_SUCCESS )
3004     {
3005         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3006         p->cleanupUMats();
3007     }
3008     else
3009     {
3010         p->addref();
3011         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3012     }
3013     return retval == CL_SUCCESS;
3014 }
3015
3016 bool Kernel::runTask(bool sync, const Queue& q)
3017 {
3018     if(!p || !p->handle || p->e != 0)
3019         return false;
3020
3021     cl_command_queue qq = getQueue(q);
3022     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
3023     if( sync || retval != CL_SUCCESS )
3024     {
3025         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
3026         p->cleanupUMats();
3027     }
3028     else
3029     {
3030         p->addref();
3031         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
3032     }
3033     return retval == CL_SUCCESS;
3034 }
3035
3036
3037 size_t Kernel::workGroupSize() const
3038 {
3039     if(!p || !p->handle)
3040         return 0;
3041     size_t val = 0, retsz = 0;
3042     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3043     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
3044                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3045 }
3046
3047 size_t Kernel::preferedWorkGroupSizeMultiple() const
3048 {
3049     if(!p || !p->handle)
3050         return 0;
3051     size_t val = 0, retsz = 0;
3052     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3053     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
3054                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
3055 }
3056
3057 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
3058 {
3059     if(!p || !p->handle || !wsz)
3060         return 0;
3061     size_t retsz = 0;
3062     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3063     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
3064                                     sizeof(wsz[0]*3), wsz, &retsz) == CL_SUCCESS;
3065 }
3066
3067 size_t Kernel::localMemSize() const
3068 {
3069     if(!p || !p->handle)
3070         return 0;
3071     size_t retsz = 0;
3072     cl_ulong val = 0;
3073     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
3074     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
3075                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
3076 }
3077
3078 /////////////////////////////////////////// Program /////////////////////////////////////////////
3079
3080 struct Program::Impl
3081 {
3082     Impl(const ProgramSource& _src,
3083          const String& _buildflags, String& errmsg)
3084     {
3085         refcount = 1;
3086         const Context& ctx = Context::getDefault();
3087         src = _src;
3088         buildflags = _buildflags;
3089         const String& srcstr = src.source();
3090         const char* srcptr = srcstr.c_str();
3091         size_t srclen = srcstr.size();
3092         cl_int retval = 0;
3093
3094         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
3095         if( handle && retval == CL_SUCCESS )
3096         {
3097             int i, n = (int)ctx.ndevices();
3098             AutoBuffer<void*> deviceListBuf(n+1);
3099             void** deviceList = deviceListBuf;
3100             for( i = 0; i < n; i++ )
3101                 deviceList[i] = ctx.device(i).ptr();
3102
3103             Device device = Device::getDefault();
3104             if (device.isAMD())
3105                 buildflags += " -D AMD_DEVICE";
3106             else if (device.isIntel())
3107                 buildflags += " -D INTEL_DEVICE";
3108
3109             retval = clBuildProgram(handle, n,
3110                                     (const cl_device_id*)deviceList,
3111                                     buildflags.c_str(), 0, 0);
3112 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
3113             if( retval != CL_SUCCESS )
3114 #endif
3115             {
3116                 size_t retsz = 0;
3117                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3118                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
3119                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
3120                 {
3121                     AutoBuffer<char> bufbuf(retsz + 16);
3122                     char* buf = bufbuf;
3123                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
3124                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
3125                     if (buildInfo_retval == CL_SUCCESS)
3126                     {
3127                         // TODO It is useful to see kernel name & program file name also
3128                         errmsg = String(buf);
3129                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
3130                         fflush(stdout);
3131                     }
3132                 }
3133                 if (retval != CL_SUCCESS && handle)
3134                 {
3135                     clReleaseProgram(handle);
3136                     handle = NULL;
3137                 }
3138             }
3139         }
3140     }
3141
3142     Impl(const String& _buf, const String& _buildflags)
3143     {
3144         refcount = 1;
3145         handle = 0;
3146         buildflags = _buildflags;
3147         if(_buf.empty())
3148             return;
3149         String prefix0 = Program::getPrefix(buildflags);
3150         const Context& ctx = Context::getDefault();
3151         const Device& dev = Device::getDefault();
3152         const char* pos0 = _buf.c_str();
3153         const char* pos1 = strchr(pos0, '\n');
3154         if(!pos1)
3155             return;
3156         const char* pos2 = strchr(pos1+1, '\n');
3157         if(!pos2)
3158             return;
3159         const char* pos3 = strchr(pos2+1, '\n');
3160         if(!pos3)
3161             return;
3162         size_t prefixlen = (pos3 - pos0)+1;
3163         String prefix(pos0, prefixlen);
3164         if( prefix != prefix0 )
3165             return;
3166         const uchar* bin = (uchar*)(pos3+1);
3167         void* devid = dev.ptr();
3168         size_t codelen = _buf.length() - prefixlen;
3169         cl_int binstatus = 0, retval = 0;
3170         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
3171                                            &codelen, &bin, &binstatus, &retval);
3172         CV_OclDbgAssert(retval == CL_SUCCESS);
3173     }
3174
3175     String store()
3176     {
3177         if(!handle)
3178             return String();
3179         size_t progsz = 0, retsz = 0;
3180         String prefix = Program::getPrefix(buildflags);
3181         size_t prefixlen = prefix.length();
3182         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
3183             return String();
3184         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
3185         uchar* buf = bufbuf;
3186         memcpy(buf, prefix.c_str(), prefixlen);
3187         buf += prefixlen;
3188         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
3189             return String();
3190         buf[progsz] = (uchar)'\0';
3191         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
3192     }
3193
3194     ~Impl()
3195     {
3196         if( handle )
3197         {
3198 #ifdef _WIN32
3199             if (!cv::__termination)
3200 #endif
3201             {
3202                 clReleaseProgram(handle);
3203             }
3204             handle = NULL;
3205         }
3206     }
3207
3208     IMPLEMENT_REFCOUNTABLE();
3209
3210     ProgramSource src;
3211     String buildflags;
3212     cl_program handle;
3213 };
3214
3215
3216 Program::Program() { p = 0; }
3217
3218 Program::Program(const ProgramSource& src,
3219         const String& buildflags, String& errmsg)
3220 {
3221     p = 0;
3222     create(src, buildflags, errmsg);
3223 }
3224
3225 Program::Program(const Program& prog)
3226 {
3227     p = prog.p;
3228     if(p)
3229         p->addref();
3230 }
3231
3232 Program& Program::operator = (const Program& prog)
3233 {
3234     Impl* newp = (Impl*)prog.p;
3235     if(newp)
3236         newp->addref();
3237     if(p)
3238         p->release();
3239     p = newp;
3240     return *this;
3241 }
3242
3243 Program::~Program()
3244 {
3245     if(p)
3246         p->release();
3247 }
3248
3249 bool Program::create(const ProgramSource& src,
3250             const String& buildflags, String& errmsg)
3251 {
3252     if(p)
3253         p->release();
3254     p = new Impl(src, buildflags, errmsg);
3255     if(!p->handle)
3256     {
3257         p->release();
3258         p = 0;
3259     }
3260     return p != 0;
3261 }
3262
3263 const ProgramSource& Program::source() const
3264 {
3265     static ProgramSource dummy;
3266     return p ? p->src : dummy;
3267 }
3268
3269 void* Program::ptr() const
3270 {
3271     return p ? p->handle : 0;
3272 }
3273
3274 bool Program::read(const String& bin, const String& buildflags)
3275 {
3276     if(p)
3277         p->release();
3278     p = new Impl(bin, buildflags);
3279     return p->handle != 0;
3280 }
3281
3282 bool Program::write(String& bin) const
3283 {
3284     if(!p)
3285         return false;
3286     bin = p->store();
3287     return !bin.empty();
3288 }
3289
3290 String Program::getPrefix() const
3291 {
3292     if(!p)
3293         return String();
3294     return getPrefix(p->buildflags);
3295 }
3296
3297 String Program::getPrefix(const String& buildflags)
3298 {
3299     const Context& ctx = Context::getDefault();
3300     const Device& dev = ctx.device(0);
3301     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
3302                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
3303 }
3304
3305 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
3306
3307 struct ProgramSource::Impl
3308 {
3309     Impl(const char* _src)
3310     {
3311         init(String(_src));
3312     }
3313     Impl(const String& _src)
3314     {
3315         init(_src);
3316     }
3317     void init(const String& _src)
3318     {
3319         refcount = 1;
3320         src = _src;
3321         h = crc64((uchar*)src.c_str(), src.size());
3322     }
3323
3324     IMPLEMENT_REFCOUNTABLE();
3325     String src;
3326     ProgramSource::hash_t h;
3327 };
3328
3329
3330 ProgramSource::ProgramSource()
3331 {
3332     p = 0;
3333 }
3334
3335 ProgramSource::ProgramSource(const char* prog)
3336 {
3337     p = new Impl(prog);
3338 }
3339
3340 ProgramSource::ProgramSource(const String& prog)
3341 {
3342     p = new Impl(prog);
3343 }
3344
3345 ProgramSource::~ProgramSource()
3346 {
3347     if(p)
3348         p->release();
3349 }
3350
3351 ProgramSource::ProgramSource(const ProgramSource& prog)
3352 {
3353     p = prog.p;
3354     if(p)
3355         p->addref();
3356 }
3357
3358 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
3359 {
3360     Impl* newp = (Impl*)prog.p;
3361     if(newp)
3362         newp->addref();
3363     if(p)
3364         p->release();
3365     p = newp;
3366     return *this;
3367 }
3368
3369 const String& ProgramSource::source() const
3370 {
3371     static String dummy;
3372     return p ? p->src : dummy;
3373 }
3374
3375 ProgramSource::hash_t ProgramSource::hash() const
3376 {
3377     return p ? p->h : 0;
3378 }
3379
3380 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
3381
3382 class OpenCLBufferPool
3383 {
3384 protected:
3385     ~OpenCLBufferPool() { }
3386 public:
3387     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity) = 0;
3388     virtual void release(cl_mem handle, size_t capacity) = 0;
3389 };
3390
3391 class OpenCLBufferPoolImpl : public BufferPoolController, public OpenCLBufferPool
3392 {
3393 public:
3394     struct BufferEntry
3395     {
3396         cl_mem clBuffer_;
3397         size_t capacity_;
3398     };
3399 protected:
3400     Mutex mutex_;
3401
3402     size_t currentReservedSize;
3403     size_t maxReservedSize;
3404
3405     std::list<BufferEntry> reservedEntries_; // LRU order
3406
3407     // synchronized
3408     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
3409     {
3410         if (reservedEntries_.empty())
3411             return false;
3412         std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3413         std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
3414         BufferEntry result = {NULL, 0};
3415         size_t minDiff = (size_t)(-1);
3416         for (; i != reservedEntries_.end(); ++i)
3417         {
3418             BufferEntry& e = *i;
3419             if (e.capacity_ >= size)
3420             {
3421                 size_t diff = e.capacity_ - size;
3422                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
3423                 {
3424                     minDiff = diff;
3425                     result_pos = i;
3426                     result = e;
3427                     if (diff == 0)
3428                         break;
3429                 }
3430             }
3431         }
3432         if (result_pos != reservedEntries_.end())
3433         {
3434             //CV_DbgAssert(result == *result_pos);
3435             reservedEntries_.erase(result_pos);
3436             entry = result;
3437             currentReservedSize -= entry.capacity_;
3438             return true;
3439         }
3440         return false;
3441     }
3442
3443     // synchronized
3444     void _checkSizeOfReservedEntries()
3445     {
3446         while (currentReservedSize > maxReservedSize)
3447         {
3448             CV_DbgAssert(!reservedEntries_.empty());
3449             const BufferEntry& entry = reservedEntries_.back();
3450             CV_DbgAssert(currentReservedSize >= entry.capacity_);
3451             currentReservedSize -= entry.capacity_;
3452             _releaseBufferEntry(entry);
3453             reservedEntries_.pop_back();
3454         }
3455     }
3456
3457     inline size_t _allocationGranularity(size_t size)
3458     {
3459         // heuristic values
3460         if (size < 1024)
3461             return 16;
3462         else if (size < 64*1024)
3463             return 64;
3464         else if (size < 1024*1024)
3465             return 4096;
3466         else if (size < 16*1024*1024)
3467             return 64*1024;
3468         else
3469             return 1024*1024;
3470     }
3471
3472     void _allocateBufferEntry(BufferEntry& entry, size_t size)
3473     {
3474         CV_DbgAssert(entry.clBuffer_ == NULL);
3475         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
3476         Context& ctx = Context::getDefault();
3477         cl_int retval = CL_SUCCESS;
3478         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE, entry.capacity_, 0, &retval);
3479         CV_Assert(retval == CL_SUCCESS);
3480         CV_Assert(entry.clBuffer_ != NULL);
3481         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
3482                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
3483     }
3484
3485     void _releaseBufferEntry(const BufferEntry& entry)
3486     {
3487         CV_Assert(entry.capacity_ != 0);
3488         CV_Assert(entry.clBuffer_ != NULL);
3489         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
3490                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
3491         clReleaseMemObject(entry.clBuffer_);
3492     }
3493 public:
3494     OpenCLBufferPoolImpl()
3495         : currentReservedSize(0), maxReservedSize(0)
3496     {
3497         int poolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
3498         maxReservedSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", poolSize);
3499     }
3500     virtual ~OpenCLBufferPoolImpl()
3501     {
3502         freeAllReservedBuffers();
3503         CV_Assert(reservedEntries_.empty());
3504     }
3505 public:
3506     virtual cl_mem allocate(size_t size, CV_OUT size_t& capacity)
3507     {
3508         BufferEntry entry = {NULL, 0};
3509         if (maxReservedSize > 0)
3510         {
3511             AutoLock locker(mutex_);
3512             if (_findAndRemoveEntryFromReservedList(entry, size))
3513             {
3514                 CV_DbgAssert(size <= entry.capacity_);
3515                 LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
3516                 capacity = entry.capacity_;
3517                 return entry.clBuffer_;
3518             }
3519         }
3520         _allocateBufferEntry(entry, size);
3521         capacity = entry.capacity_;
3522         return entry.clBuffer_;
3523     }
3524     virtual void release(cl_mem handle, size_t capacity)
3525     {
3526         BufferEntry entry = {handle, capacity};
3527         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
3528         {
3529             _releaseBufferEntry(entry);
3530         }
3531         else
3532         {
3533             AutoLock locker(mutex_);
3534             reservedEntries_.push_front(entry);
3535             currentReservedSize += entry.capacity_;
3536             _checkSizeOfReservedEntries();
3537         }
3538     }
3539
3540     virtual size_t getReservedSize() const { return currentReservedSize; }
3541     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
3542     virtual void setMaxReservedSize(size_t size)
3543     {
3544         AutoLock locker(mutex_);
3545         size_t oldMaxReservedSize = maxReservedSize;
3546         maxReservedSize = size;
3547         if (maxReservedSize < oldMaxReservedSize)
3548         {
3549             std::list<BufferEntry>::iterator i = reservedEntries_.begin();
3550             for (; i != reservedEntries_.end();)
3551             {
3552                 const BufferEntry& entry = *i;
3553                 if (entry.capacity_ > maxReservedSize / 8)
3554                 {
3555                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
3556                     currentReservedSize -= entry.capacity_;
3557                     _releaseBufferEntry(entry);
3558                     i = reservedEntries_.erase(i);
3559                     continue;
3560                 }
3561                 ++i;
3562             }
3563             _checkSizeOfReservedEntries();
3564         }
3565     }
3566     virtual void freeAllReservedBuffers()
3567     {
3568         AutoLock locker(mutex_);
3569         std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
3570         for (; i != reservedEntries_.end(); ++i)
3571         {
3572             const BufferEntry& entry = *i;
3573             _releaseBufferEntry(entry);
3574         }
3575         reservedEntries_.clear();
3576     }
3577 };
3578
3579 #if defined _MSC_VER
3580 #pragma warning(disable:4127) // conditional expression is constant
3581 #endif
3582 template <bool readAccess, bool writeAccess>
3583 class AlignedDataPtr
3584 {
3585 protected:
3586     const size_t size_;
3587     uchar* const originPtr_;
3588     const size_t alignment_;
3589     uchar* ptr_;
3590     uchar* allocatedPtr_;
3591
3592 public:
3593     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
3594         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
3595     {
3596         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
3597         if (((size_t)ptr_ & (alignment - 1)) != 0)
3598         {
3599             allocatedPtr_ = new uchar[size_ + alignment - 1];
3600             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
3601             if (readAccess)
3602             {
3603                 memcpy(ptr_, originPtr_, size_);
3604             }
3605         }
3606     }
3607
3608     uchar* getAlignedPtr() const
3609     {
3610         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
3611         return ptr_;
3612     }
3613
3614     ~AlignedDataPtr()
3615     {
3616         if (allocatedPtr_)
3617         {
3618             if (writeAccess)
3619             {
3620                 memcpy(originPtr_, ptr_, size_);
3621             }
3622             delete[] allocatedPtr_;
3623             allocatedPtr_ = NULL;
3624         }
3625         ptr_ = NULL;
3626     }
3627 private:
3628     AlignedDataPtr(const AlignedDataPtr&); // disabled
3629     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
3630 };
3631 #if defined _MSC_VER
3632 #pragma warning(default:4127) // conditional expression is constant
3633 #endif
3634
3635 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
3636 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
3637 #endif
3638
3639 class OpenCLAllocator : public MatAllocator
3640 {
3641     mutable OpenCLBufferPoolImpl bufferPool;
3642     enum AllocatorFlags
3643     {
3644         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0
3645     };
3646 public:
3647     OpenCLAllocator() { matStdAllocator = Mat::getStdAllocator(); }
3648
3649     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
3650             int flags, UMatUsageFlags usageFlags) const
3651     {
3652         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
3653         return u;
3654     }
3655
3656     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
3657     {
3658         const Device& dev = ctx.device(0);
3659         createFlags = 0;
3660         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
3661             createFlags |= CL_MEM_ALLOC_HOST_PTR;
3662
3663         if( dev.hostUnifiedMemory() )
3664             flags0 = 0;
3665         else
3666             flags0 = UMatData::COPY_ON_MAP;
3667     }
3668
3669     UMatData* allocate(int dims, const int* sizes, int type,
3670                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
3671     {
3672         if(!useOpenCL())
3673             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3674         CV_Assert(data == 0);
3675         size_t total = CV_ELEM_SIZE(type);
3676         for( int i = dims-1; i >= 0; i-- )
3677         {
3678             if( step )
3679                 step[i] = total;
3680             total *= sizes[i];
3681         }
3682
3683         Context& ctx = Context::getDefault();
3684         int createFlags = 0, flags0 = 0;
3685         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
3686
3687         size_t capacity = 0;
3688         void* handle = NULL;
3689         int allocatorFlags = 0;
3690         if (createFlags == 0)
3691         {
3692             handle = bufferPool.allocate(total, capacity);
3693             if (!handle)
3694                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3695             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
3696         }
3697         else
3698         {
3699             capacity = total;
3700             cl_int retval = 0;
3701             handle = clCreateBuffer((cl_context)ctx.ptr(),
3702                                           CL_MEM_READ_WRITE|createFlags, total, 0, &retval);
3703             if( !handle || retval != CL_SUCCESS )
3704                 return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
3705         }
3706         UMatData* u = new UMatData(this);
3707         u->data = 0;
3708         u->size = total;
3709         u->capacity = capacity;
3710         u->handle = handle;
3711         u->flags = flags0;
3712         u->allocatorFlags_ = allocatorFlags;
3713         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
3714         return u;
3715     }
3716
3717     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
3718     {
3719         if(!u)
3720             return false;
3721
3722         UMatDataAutoLock lock(u);
3723
3724         if(u->handle == 0)
3725         {
3726             CV_Assert(u->origdata != 0);
3727             Context& ctx = Context::getDefault();
3728             int createFlags = 0, flags0 = 0;
3729             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
3730
3731             cl_context ctx_handle = (cl_context)ctx.ptr();
3732             cl_int retval = 0;
3733             int tempUMatFlags = UMatData::TEMP_UMAT;
3734             u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|CL_MEM_READ_WRITE,
3735                                        u->size, u->origdata, &retval);
3736             if((!u->handle || retval != CL_SUCCESS) && !(accessFlags & ACCESS_FAST))
3737             {
3738                 u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
3739                                            u->size, u->origdata, &retval);
3740                 tempUMatFlags = UMatData::TEMP_COPIED_UMAT;
3741             }
3742             if(!u->handle || retval != CL_SUCCESS)
3743                 return false;
3744             u->prevAllocator = u->currAllocator;
3745             u->currAllocator = this;
3746             u->flags |= tempUMatFlags;
3747         }
3748         if(accessFlags & ACCESS_WRITE)
3749             u->markHostCopyObsolete(true);
3750         return true;
3751     }
3752
3753     /*void sync(UMatData* u) const
3754     {
3755         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3756         UMatDataAutoLock lock(u);
3757
3758         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
3759         {
3760             if( u->tempCopiedUMat() )
3761             {
3762                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3763                                     u->size, u->origdata, 0, 0, 0);
3764             }
3765             else
3766             {
3767                 cl_int retval = 0;
3768                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3769                                                 (CL_MAP_READ | CL_MAP_WRITE),
3770                                                 0, u->size, 0, 0, 0, &retval);
3771                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
3772                 clFinish(q);
3773             }
3774             u->markHostCopyObsolete(false);
3775         }
3776         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
3777         {
3778             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3779                                  u->size, u->data, 0, 0, 0);
3780         }
3781     }*/
3782
3783     void deallocate(UMatData* u) const
3784     {
3785         if(!u)
3786             return;
3787
3788         CV_Assert(u->urefcount >= 0);
3789         CV_Assert(u->refcount >= 0);
3790
3791         // TODO: !!! when we add Shared Virtual Memory Support,
3792         // this function (as well as the others) should be corrected
3793         CV_Assert(u->handle != 0 && u->urefcount == 0);
3794         if(u->tempUMat())
3795         {
3796 //            UMatDataAutoLock lock(u);
3797             if( u->hostCopyObsolete() && u->refcount > 0 )
3798             {
3799                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3800                 if( u->tempCopiedUMat() )
3801                 {
3802                     AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3803                     CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3804                                         u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
3805                 }
3806                 else
3807                 {
3808                     cl_int retval = 0;
3809                     void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3810                                                     (CL_MAP_READ | CL_MAP_WRITE),
3811                                                     0, u->size, 0, 0, 0, &retval);
3812                     CV_OclDbgAssert(retval == CL_SUCCESS);
3813                     CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
3814                     CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3815                 }
3816             }
3817             u->markHostCopyObsolete(false);
3818             clReleaseMemObject((cl_mem)u->handle);
3819             u->handle = 0;
3820             u->currAllocator = u->prevAllocator;
3821             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3822                 fastFree(u->data);
3823             u->data = u->origdata;
3824             if(u->refcount == 0)
3825                 u->currAllocator->deallocate(u);
3826         }
3827         else
3828         {
3829             CV_Assert(u->refcount == 0);
3830             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
3831             {
3832                 fastFree(u->data);
3833                 u->data = 0;
3834             }
3835             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
3836             {
3837                 bufferPool.release((cl_mem)u->handle, u->capacity);
3838             }
3839             else
3840             {
3841                 clReleaseMemObject((cl_mem)u->handle);
3842             }
3843             u->handle = 0;
3844             u->capacity = 0;
3845             delete u;
3846         }
3847     }
3848
3849     void map(UMatData* u, int accessFlags) const
3850     {
3851         if(!u)
3852             return;
3853
3854         CV_Assert( u->handle != 0 );
3855
3856         UMatDataAutoLock autolock(u);
3857
3858         if(accessFlags & ACCESS_WRITE)
3859             u->markDeviceCopyObsolete(true);
3860
3861         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3862
3863         // FIXIT Workaround for UMat synchronization issue
3864         // if( u->refcount == 0 )
3865         {
3866             if( !u->copyOnMap() )
3867             {
3868                 if (u->data) // FIXIT Workaround for UMat synchronization issue
3869                 {
3870                     //CV_Assert(u->hostCopyObsolete() == false);
3871                     return;
3872                 }
3873                 // because there can be other map requests for the same UMat with different access flags,
3874                 // we use the universal (read-write) access mode.
3875                 cl_int retval = 0;
3876                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
3877                                                      (CL_MAP_READ | CL_MAP_WRITE),
3878                                                      0, u->size, 0, 0, 0, &retval);
3879                 if(u->data && retval == CL_SUCCESS)
3880                 {
3881                     u->markHostCopyObsolete(false);
3882                     return;
3883                 }
3884
3885                 // if map failed, switch to copy-on-map mode for the particular buffer
3886                 u->flags |= UMatData::COPY_ON_MAP;
3887             }
3888
3889             if(!u->data)
3890             {
3891                 u->data = (uchar*)fastMalloc(u->size);
3892                 u->markHostCopyObsolete(true);
3893             }
3894         }
3895
3896         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
3897         {
3898             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3899             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3900                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
3901             u->markHostCopyObsolete(false);
3902         }
3903     }
3904
3905     void unmap(UMatData* u) const
3906     {
3907         if(!u)
3908             return;
3909
3910         CV_Assert(u->handle != 0);
3911
3912         UMatDataAutoLock autolock(u);
3913
3914         // FIXIT Workaround for UMat synchronization issue
3915         if(u->refcount > 0)
3916             return;
3917
3918         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
3919         cl_int retval = 0;
3920         if( !u->copyOnMap() && u->data )
3921         {
3922             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
3923                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
3924             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
3925             u->data = 0;
3926         }
3927         else if( u->copyOnMap() && u->deviceCopyObsolete() )
3928         {
3929             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
3930             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
3931                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
3932         }
3933         u->markDeviceCopyObsolete(false);
3934         u->markHostCopyObsolete(false);
3935     }
3936
3937     bool checkContinuous(int dims, const size_t sz[],
3938                          const size_t srcofs[], const size_t srcstep[],
3939                          const size_t dstofs[], const size_t dststep[],
3940                          size_t& total, size_t new_sz[],
3941                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
3942                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
3943     {
3944         bool iscontinuous = true;
3945         srcrawofs = srcofs ? srcofs[dims-1] : 0;
3946         dstrawofs = dstofs ? dstofs[dims-1] : 0;
3947         total = sz[dims-1];
3948         for( int i = dims-2; i >= 0; i-- )
3949         {
3950             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
3951                 iscontinuous = false;
3952             total *= sz[i];
3953             if( srcofs )
3954                 srcrawofs += srcofs[i]*srcstep[i];
3955             if( dstofs )
3956                 dstrawofs += dstofs[i]*dststep[i];
3957         }
3958
3959         if( !iscontinuous )
3960         {
3961             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
3962             if( dims == 2 )
3963             {
3964                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
3965                 // we assume that new_... arrays are initialized by caller
3966                 // with 0's, so there is no else branch
3967                 if( srcofs )
3968                 {
3969                     new_srcofs[0] = srcofs[1];
3970                     new_srcofs[1] = srcofs[0];
3971                     new_srcofs[2] = 0;
3972                 }
3973
3974                 if( dstofs )
3975                 {
3976                     new_dstofs[0] = dstofs[1];
3977                     new_dstofs[1] = dstofs[0];
3978                     new_dstofs[2] = 0;
3979                 }
3980
3981                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
3982                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
3983             }
3984             else
3985             {
3986                 // we could check for dims == 3 here,
3987                 // but from user perspective this one is more informative
3988                 CV_Assert(dims <= 3);
3989                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
3990                 if( srcofs )
3991                 {
3992                     new_srcofs[0] = srcofs[2];
3993                     new_srcofs[1] = srcofs[1];
3994                     new_srcofs[2] = srcofs[0];
3995                 }
3996
3997                 if( dstofs )
3998                 {
3999                     new_dstofs[0] = dstofs[2];
4000                     new_dstofs[1] = dstofs[1];
4001                     new_dstofs[2] = dstofs[0];
4002                 }
4003
4004                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
4005                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
4006             }
4007         }
4008         return iscontinuous;
4009     }
4010
4011     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
4012                   const size_t srcofs[], const size_t srcstep[],
4013                   const size_t dststep[]) const
4014     {
4015         if(!u)
4016             return;
4017         UMatDataAutoLock autolock(u);
4018
4019         if( u->data && !u->hostCopyObsolete() )
4020         {
4021             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
4022             return;
4023         }
4024         CV_Assert( u->handle != 0 );
4025
4026         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4027
4028         size_t total = 0, new_sz[] = {0, 0, 0};
4029         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4030         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4031
4032         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
4033                                             total, new_sz,
4034                                             srcrawofs, new_srcofs, new_srcstep,
4035                                             dstrawofs, new_dstofs, new_dststep);
4036
4037         AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4038         if( iscontinuous )
4039         {
4040             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
4041                                            srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4042         }
4043         else
4044         {
4045             CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4046                             new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
4047                             new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
4048         }
4049     }
4050
4051     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
4052                 const size_t dstofs[], const size_t dststep[],
4053                 const size_t srcstep[]) const
4054     {
4055         if(!u)
4056             return;
4057
4058         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4059         CV_Assert(u->refcount == 0 || u->tempUMat());
4060
4061         size_t total = 0, new_sz[] = {0, 0, 0};
4062         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4063         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4064
4065         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
4066                                             total, new_sz,
4067                                             srcrawofs, new_srcofs, new_srcstep,
4068                                             dstrawofs, new_dstofs, new_dststep);
4069
4070         UMatDataAutoLock autolock(u);
4071
4072         // if there is cached CPU copy of the GPU matrix,
4073         // we could use it as a destination.
4074         // we can do it in 2 cases:
4075         //    1. we overwrite the whole content
4076         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
4077         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
4078         {
4079             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
4080             u->markHostCopyObsolete(false);
4081             u->markDeviceCopyObsolete(true);
4082             return;
4083         }
4084
4085         CV_Assert( u->handle != 0 );
4086         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4087
4088         AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
4089         if( iscontinuous )
4090         {
4091             CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
4092                 CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) == CL_SUCCESS );
4093         }
4094         else
4095         {
4096             CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
4097                 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
4098                 new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) == CL_SUCCESS );
4099         }
4100
4101         u->markHostCopyObsolete(true);
4102         u->markDeviceCopyObsolete(false);
4103     }
4104
4105     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
4106               const size_t srcofs[], const size_t srcstep[],
4107               const size_t dstofs[], const size_t dststep[], bool _sync) const
4108     {
4109         if(!src || !dst)
4110             return;
4111
4112         size_t total = 0, new_sz[] = {0, 0, 0};
4113         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
4114         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
4115
4116         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
4117                                             total, new_sz,
4118                                             srcrawofs, new_srcofs, new_srcstep,
4119                                             dstrawofs, new_dstofs, new_dststep);
4120
4121         UMatDataAutoLock src_autolock(src);
4122         UMatDataAutoLock dst_autolock(dst);
4123
4124         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
4125         {
4126             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
4127             return;
4128         }
4129         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
4130         {
4131             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
4132             dst->markHostCopyObsolete(false);
4133             dst->markDeviceCopyObsolete(true);
4134             return;
4135         }
4136
4137         // there should be no user-visible CPU copies of the UMat which we are going to copy to
4138         CV_Assert(dst->refcount == 0);
4139         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
4140
4141         if( iscontinuous )
4142         {
4143             CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4144                                            srcrawofs, dstrawofs, total, 0, 0, 0) == CL_SUCCESS );
4145         }
4146         else
4147         {
4148             cl_int retval;
4149             CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
4150                                                new_srcofs, new_dstofs, new_sz,
4151                                                new_srcstep[0], new_srcstep[1],
4152                                                new_dststep[0], new_dststep[1],
4153                                                0, 0, 0)) == CL_SUCCESS );
4154         }
4155
4156         dst->markHostCopyObsolete(true);
4157         dst->markDeviceCopyObsolete(false);
4158
4159         if( _sync )
4160         {
4161             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
4162         }
4163     }
4164
4165     BufferPoolController* getBufferPoolController() const { return &bufferPool; }
4166
4167     MatAllocator* matStdAllocator;
4168 };
4169
4170 MatAllocator* getOpenCLAllocator()
4171 {
4172     static MatAllocator * allocator = new OpenCLAllocator();
4173     return allocator;
4174 }
4175
4176 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
4177
4178 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
4179 {
4180     cl_uint numDevices = 0;
4181     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4182                                 0, NULL, &numDevices) == CL_SUCCESS);
4183
4184     if (numDevices == 0)
4185     {
4186         devices.clear();
4187         return;
4188     }
4189
4190     devices.resize((size_t)numDevices);
4191     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
4192                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
4193 }
4194
4195 struct PlatformInfo::Impl
4196 {
4197     Impl(void* id)
4198     {
4199         refcount = 1;
4200         handle = *(cl_platform_id*)id;
4201         getDevices(devices, handle);
4202     }
4203
4204     String getStrProp(cl_device_info prop) const
4205     {
4206         char buf[1024];
4207         size_t sz=0;
4208         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
4209             sz < sizeof(buf) ? String(buf) : String();
4210     }
4211
4212     IMPLEMENT_REFCOUNTABLE();
4213     std::vector<cl_device_id> devices;
4214     cl_platform_id handle;
4215 };
4216
4217 PlatformInfo::PlatformInfo()
4218 {
4219     p = 0;
4220 }
4221
4222 PlatformInfo::PlatformInfo(void* platform_id)
4223 {
4224     p = new Impl(platform_id);
4225 }
4226
4227 PlatformInfo::~PlatformInfo()
4228 {
4229     if(p)
4230         p->release();
4231 }
4232
4233 PlatformInfo::PlatformInfo(const PlatformInfo& i)
4234 {
4235     if (i.p)
4236         i.p->addref();
4237     p = i.p;
4238 }
4239
4240 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
4241 {
4242     if (i.p != p)
4243     {
4244         if (i.p)
4245             i.p->addref();
4246         if (p)
4247             p->release();
4248         p = i.p;
4249     }
4250     return *this;
4251 }
4252
4253 int PlatformInfo::deviceNumber() const
4254 {
4255     return p ? (int)p->devices.size() : 0;
4256 }
4257
4258 void PlatformInfo::getDevice(Device& device, int d) const
4259 {
4260     CV_Assert(p && d < (int)p->devices.size() );
4261     if(p)
4262         device.set(p->devices[d]);
4263 }
4264
4265 String PlatformInfo::name() const
4266 {
4267     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
4268 }
4269
4270 String PlatformInfo::vendor() const
4271 {
4272     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
4273 }
4274
4275 String PlatformInfo::version() const
4276 {
4277     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
4278 }
4279
4280 static void getPlatforms(std::vector<cl_platform_id>& platforms)
4281 {
4282     cl_uint numPlatforms = 0;
4283     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
4284
4285     if (numPlatforms == 0)
4286     {
4287         platforms.clear();
4288         return;
4289     }
4290
4291     platforms.resize((size_t)numPlatforms);
4292     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
4293 }
4294
4295 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
4296 {
4297     std::vector<cl_platform_id> platforms;
4298     getPlatforms(platforms);
4299
4300     for (size_t i = 0; i < platforms.size(); i++)
4301         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
4302 }
4303
4304 const char* typeToStr(int type)
4305 {
4306     static const char* tab[]=
4307     {
4308         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4309         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4310         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4311         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4312         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4313         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
4314         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
4315         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4316     };
4317     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4318     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4319 }
4320
4321 const char* memopTypeToStr(int type)
4322 {
4323     static const char* tab[] =
4324     {
4325         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
4326         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
4327         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
4328         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
4329         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4330         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
4331         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
4332         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
4333     };
4334     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
4335     return cn > 16 ? "?" : tab[depth*16 + cn-1];
4336 }
4337
4338 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
4339 {
4340     if( sdepth == ddepth )
4341         return "noconvert";
4342     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
4343     if( ddepth >= CV_32F ||
4344         (ddepth == CV_32S && sdepth < CV_32S) ||
4345         (ddepth == CV_16S && sdepth <= CV_8S) ||
4346         (ddepth == CV_16U && sdepth == CV_8U))
4347     {
4348         sprintf(buf, "convert_%s", typestr);
4349     }
4350     else if( sdepth >= CV_32F )
4351         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
4352     else
4353         sprintf(buf, "convert_%s_sat", typestr);
4354
4355     return buf;
4356 }
4357
4358 template <typename T>
4359 static std::string kerToStr(const Mat & k)
4360 {
4361     int width = k.cols - 1, depth = k.depth();
4362     const T * const data = reinterpret_cast<const T *>(k.data);
4363
4364     std::ostringstream stream;
4365     stream.precision(10);
4366
4367     if (depth <= CV_8S)
4368     {
4369         for (int i = 0; i < width; ++i)
4370             stream << "DIG(" << (int)data[i] << ")";
4371         stream << "DIG(" << (int)data[width] << ")";
4372     }
4373     else if (depth == CV_32F)
4374     {
4375         stream.setf(std::ios_base::showpoint);
4376         for (int i = 0; i < width; ++i)
4377             stream << "DIG(" << data[i] << "f)";
4378         stream << "DIG(" << data[width] << "f)";
4379     }
4380     else
4381     {
4382         for (int i = 0; i < width; ++i)
4383             stream << "DIG(" << data[i] << ")";
4384         stream << "DIG(" << data[width] << ")";
4385     }
4386
4387     return stream.str();
4388 }
4389
4390 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
4391 {
4392     Mat kernel = _kernel.getMat().reshape(1, 1);
4393
4394     int depth = kernel.depth();
4395     if (ddepth < 0)
4396         ddepth = depth;
4397
4398     if (ddepth != depth)
4399         kernel.convertTo(kernel, ddepth);
4400
4401     typedef std::string (* func_t)(const Mat &);
4402     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
4403                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
4404     const func_t func = funcs[ddepth];
4405     CV_Assert(func != 0);
4406
4407     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
4408 }
4409
4410 #define PROCESS_SRC(src) \
4411     do \
4412     { \
4413         if (!src.empty()) \
4414         { \
4415             CV_Assert(src.isMat() || src.isUMat()); \
4416             int ctype = src.type(), ccn = CV_MAT_CN(ctype); \
4417             Size csize = src.size(); \
4418             cols.push_back(ccn * csize.width); \
4419             if (ctype != type) \
4420                 return 1; \
4421             offsets.push_back(src.offset()); \
4422             steps.push_back(src.step()); \
4423         } \
4424     } \
4425     while ((void)0, 0)
4426
4427 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
4428                               InputArray src4, InputArray src5, InputArray src6,
4429                               InputArray src7, InputArray src8, InputArray src9)
4430 {
4431     int type = src1.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type), esz1 = CV_ELEM_SIZE1(depth);
4432     Size ssize = src1.size();
4433     const ocl::Device & d = ocl::Device::getDefault();
4434
4435     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
4436         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
4437         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
4438         d.preferredVectorWidthDouble(), -1 }, kercn = vectorWidths[depth];
4439
4440     // if the device says don't use vectors
4441     if (vectorWidths[0] == 1)
4442     {
4443         // it's heuristic
4444         int vectorWidthsOthers[] = { 16, 16, 8, 8, 1, 1, 1, -1 };
4445         kercn = vectorWidthsOthers[depth];
4446     }
4447
4448     if (ssize.width * cn < kercn || kercn <= 0)
4449         return 1;
4450
4451     std::vector<size_t> offsets, steps, cols;
4452     PROCESS_SRC(src1);
4453     PROCESS_SRC(src2);
4454     PROCESS_SRC(src3);
4455     PROCESS_SRC(src4);
4456     PROCESS_SRC(src5);
4457     PROCESS_SRC(src6);
4458     PROCESS_SRC(src7);
4459     PROCESS_SRC(src8);
4460     PROCESS_SRC(src9);
4461
4462     size_t size = offsets.size();
4463     int wsz = kercn * esz1;
4464     std::vector<int> dividers(size, wsz);
4465
4466     for (size_t i = 0; i < size; ++i)
4467         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % dividers[i] != 0)
4468             dividers[i] >>= 1;
4469
4470     // default strategy
4471     for (size_t i = 0; i < size; ++i)
4472         if (dividers[i] != wsz)
4473         {
4474             kercn = 1;
4475             break;
4476         }
4477
4478     // another strategy
4479 //    width = *std::min_element(dividers.begin(), dividers.end());
4480
4481     return kercn;
4482 }
4483
4484 #undef PROCESS_SRC
4485
4486
4487 // TODO Make this as a method of OpenCL "BuildOptions" class
4488 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
4489 {
4490     if (!buildOptions.empty())
4491         buildOptions += " ";
4492     int type = _m.type(), depth = CV_MAT_DEPTH(type);
4493     buildOptions += format(
4494             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
4495             name.c_str(), ocl::typeToStr(type),
4496             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
4497             name.c_str(), (int)CV_MAT_CN(type),
4498             name.c_str(), (int)CV_ELEM_SIZE(type),
4499             name.c_str(), (int)CV_ELEM_SIZE1(type),
4500             name.c_str(), (int)depth
4501             );
4502 }
4503
4504
4505 struct Image2D::Impl
4506 {
4507     Impl(const UMat &src, bool norm, bool alias)
4508     {
4509         handle = 0;
4510         refcount = 1;
4511         init(src, norm, alias);
4512     }
4513
4514     ~Impl()
4515     {
4516         if (handle)
4517             clReleaseMemObject(handle);
4518     }
4519
4520     static cl_image_format getImageFormat(int depth, int cn, bool norm)
4521     {
4522         cl_image_format format;
4523         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
4524                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
4525         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
4526                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
4527         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
4528
4529         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
4530         int channelOrder = channelOrders[cn];
4531         format.image_channel_data_type = (cl_channel_type)channelType;
4532         format.image_channel_order = (cl_channel_order)channelOrder;
4533         return format;
4534     }
4535
4536     static bool isFormatSupported(cl_image_format format)
4537     {
4538         cl_context context = (cl_context)Context::getDefault().ptr();
4539         // Figure out how many formats are supported by this context.
4540         cl_uint numFormats = 0;
4541         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4542                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
4543                                                 NULL, &numFormats);
4544         AutoBuffer<cl_image_format> formats(numFormats);
4545         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
4546                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
4547                                          formats, NULL);
4548         CV_OclDbgAssert(err == CL_SUCCESS);
4549         for (cl_uint i = 0; i < numFormats; ++i)
4550         {
4551             if (!memcmp(&formats[i], &format, sizeof(format)))
4552             {
4553                 return true;
4554             }
4555         }
4556         return false;
4557     }
4558
4559     void init(const UMat &src, bool norm, bool alias)
4560     {
4561         CV_Assert(ocl::Device::getDefault().imageSupport());
4562
4563         int err, depth = src.depth(), cn = src.channels();
4564         CV_Assert(cn <= 4);
4565         cl_image_format format = getImageFormat(depth, cn, norm);
4566
4567         if (!isFormatSupported(format))
4568             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
4569
4570         cl_context context = (cl_context)Context::getDefault().ptr();
4571         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
4572
4573 #ifdef CL_VERSION_1_2
4574         // this enables backwards portability to
4575         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
4576         const Device & d = ocl::Device::getDefault();
4577         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
4578         CV_Assert(!alias || canCreateAlias(src));
4579         if (1 < major || (1 == major && 2 <= minor))
4580         {
4581             cl_image_desc desc;
4582             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
4583             desc.image_width      = src.cols;
4584             desc.image_height     = src.rows;
4585             desc.image_depth      = 0;
4586             desc.image_array_size = 1;
4587             desc.image_row_pitch  = alias ? src.step[0] : 0;
4588             desc.image_slice_pitch = 0;
4589             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
4590             desc.num_mip_levels   = 0;
4591             desc.num_samples      = 0;
4592             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
4593         }
4594         else
4595 #endif
4596         {
4597             CV_SUPPRESS_DEPRECATED_START
4598             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
4599             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
4600             CV_SUPPRESS_DEPRECATED_END
4601         }
4602         CV_OclDbgAssert(err == CL_SUCCESS);
4603
4604         size_t origin[] = { 0, 0, 0 };
4605         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
4606
4607         cl_mem devData;
4608         if (!alias && !src.isContinuous())
4609         {
4610             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
4611             CV_OclDbgAssert(err == CL_SUCCESS);
4612
4613             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
4614             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
4615                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
4616             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4617         }
4618         else
4619         {
4620             devData = (cl_mem)src.handle(ACCESS_READ);
4621         }
4622         CV_Assert(devData != NULL);
4623
4624         if (!alias)
4625         {
4626             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
4627             if (!src.isContinuous())
4628             {
4629                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
4630                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
4631             }
4632         }
4633     }
4634
4635     IMPLEMENT_REFCOUNTABLE();
4636
4637     cl_mem handle;
4638 };
4639
4640 Image2D::Image2D()
4641 {
4642     p = NULL;
4643 }
4644
4645 Image2D::Image2D(const UMat &src, bool norm, bool alias)
4646 {
4647     p = new Impl(src, norm, alias);
4648 }
4649
4650 bool Image2D::canCreateAlias(const UMat &m)
4651 {
4652     bool ret = false;
4653     const Device & d = ocl::Device::getDefault();
4654     if (d.imageFromBufferSupport())
4655     {
4656         // This is the required pitch alignment in pixels
4657         uint pitchAlign = d.imagePitchAlignment();
4658         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
4659         {
4660             // We don't currently handle the case where the buffer was created
4661             // with CL_MEM_USE_HOST_PTR
4662             if (!m.u->tempUMat())
4663             {
4664                 ret = true;
4665             }
4666         }
4667     }
4668     return ret;
4669 }
4670
4671 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
4672 {
4673     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
4674
4675     return Impl::isFormatSupported(format);
4676 }
4677
4678 Image2D::Image2D(const Image2D & i)
4679 {
4680     p = i.p;
4681     if (p)
4682         p->addref();
4683 }
4684
4685 Image2D & Image2D::operator = (const Image2D & i)
4686 {
4687     if (i.p != p)
4688     {
4689         if (i.p)
4690             i.p->addref();
4691         if (p)
4692             p->release();
4693         p = i.p;
4694     }
4695     return *this;
4696 }
4697
4698 Image2D::~Image2D()
4699 {
4700     if (p)
4701         p->release();
4702 }
4703
4704 void* Image2D::ptr() const
4705 {
4706     return p ? p->handle : 0;
4707 }
4708
4709 }}