From: Vadim Pisarevsky Date: Tue, 22 Oct 2013 10:04:49 +0000 (+0400) Subject: the first draft of transparent API and new UMat class. X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~3708^2~13 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=278fb617d21ca5af6a0b41a25909587d999c2cdc;p=platform%2Fupstream%2Fopencv.git the first draft of transparent API and new UMat class. --- diff --git a/modules/core/include/opencv2/core/ocl.hpp b/modules/core/include/opencv2/core/ocl.hpp new file mode 100644 index 0000000..0f62c08 --- /dev/null +++ b/modules/core/include/opencv2/core/ocl.hpp @@ -0,0 +1,462 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the OpenCV Foundation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
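For a sense of what the new cv::ocl layer looks like from the caller's side, here is a minimal sketch that queries the default device through the Device class declared below. It uses only accessors present in this header and assumes the library is built with this module; it is an illustration, not part of the patch:

    #include "opencv2/core/ocl.hpp"
    #include <cstdio>

    int main()
    {
        if( !cv::ocl::haveOpenCL() )
        {
            printf("OpenCL is not available\n");
            return 0;
        }
        const cv::ocl::Device& d = cv::ocl::Device::getDefault();
        printf("device: %s (%s)\n", d.name().c_str(), d.vendor().c_str());
        printf("compute units: %d, max workgroup size: %d\n",
               d.maxComputeUnits(), (int)d.maxWorkGroupSize());
        printf("host-unified memory: %s\n", d.hostUnifiedMemory() ? "yes" : "no");
        return 0;
    }
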
+// +//M*/ + +#ifndef __OPENCV_OPENCL_HPP__ +#define __OPENCV_OPENCL_HPP__ + +#include "opencv2/core.hpp" + +namespace cv { namespace ocl { + +CV_EXPORTS bool haveOpenCL(); +CV_EXPORTS bool useOpenCL(); +CV_EXPORTS void setUseOpenCL(bool flag); +CV_EXPORTS void finish(); + +class CV_EXPORTS Context; +class CV_EXPORTS Device; +class CV_EXPORTS Kernel; +class CV_EXPORTS Program; +class CV_EXPORTS ProgramSource; +class CV_EXPORTS Queue; + +class CV_EXPORTS Device +{ +public: + Device(); + explicit Device(void* d); + Device(const Device& d); + Device& operator = (const Device& d); + ~Device(); + + void set(void* d); + + enum + { + TYPE_DEFAULT = (1 << 0), + TYPE_CPU = (1 << 1), + TYPE_GPU = (1 << 2), + TYPE_ACCELERATOR = (1 << 3), + TYPE_DGPU = TYPE_GPU + (1 << 16), + TYPE_IGPU = TYPE_GPU + (1 << 17), + TYPE_ALL = 0xFFFFFFFF + }; + + String name() const; + String extensions() const; + String vendor() const; + String OpenCL_C_Version() const; + String OpenCLVersion() const; + String driverVersion() const; + void* ptr() const; + + int type() const; + + int addressBits() const; + bool available() const; + bool compilerAvailable() const; + bool linkerAvailable() const; + + enum + { + FP_DENORM=(1 << 0), + FP_INF_NAN=(1 << 1), + FP_ROUND_TO_NEAREST=(1 << 2), + FP_ROUND_TO_ZERO=(1 << 3), + FP_ROUND_TO_INF=(1 << 4), + FP_FMA=(1 << 5), + FP_SOFT_FLOAT=(1 << 6), + FP_CORRECTLY_ROUNDED_DIVIDE_SQRT=(1 << 7) + }; + int doubleFPConfig() const; + int singleFPConfig() const; + int halfFPConfig() const; + + bool endianLittle() const; + bool errorCorrectionSupport() const; + + enum + { + EXEC_KERNEL=(1 << 0), + EXEC_NATIVE_KERNEL=(1 << 1) + }; + int executionCapabilities() const; + + size_t globalMemCacheSize() const; + + enum + { + NO_CACHE=0, + READ_ONLY_CACHE=1, + READ_WRITE_CACHE=2 + }; + int globalMemCacheType() const; + int globalMemCacheLineSize() const; + size_t globalMemSize() const; + + size_t localMemSize() const; + enum + { + NO_LOCAL_MEM=0, + LOCAL_IS_LOCAL=1, + LOCAL_IS_GLOBAL=2 + }; + int localMemType() const; + bool hostUnifiedMemory() const; + + bool imageSupport() const; + + size_t image2DMaxWidth() const; + size_t image2DMaxHeight() const; + + size_t image3DMaxWidth() const; + size_t image3DMaxHeight() const; + size_t image3DMaxDepth() const; + + size_t imageMaxBufferSize() const; + size_t imageMaxArraySize() const; + + int maxClockFrequency() const; + int maxComputeUnits() const; + int maxConstantArgs() const; + size_t maxConstantBufferSize() const; + + size_t maxMemAllocSize() const; + size_t maxParameterSize() const; + + int maxReadImageArgs() const; + int maxWriteImageArgs() const; + int maxSamplers() const; + + size_t maxWorkGroupSize() const; + int maxWorkItemDims() const; + void maxWorkItemSizes(size_t*) const; + + int memBaseAddrAlign() const; + + int nativeVectorWidthChar() const; + int nativeVectorWidthShort() const; + int nativeVectorWidthInt() const; + int nativeVectorWidthLong() const; + int nativeVectorWidthFloat() const; + int nativeVectorWidthDouble() const; + int nativeVectorWidthHalf() const; + + int preferredVectorWidthChar() const; + int preferredVectorWidthShort() const; + int preferredVectorWidthInt() const; + int preferredVectorWidthLong() const; + int preferredVectorWidthFloat() const; + int preferredVectorWidthDouble() const; + int preferredVectorWidthHalf() const; + + size_t printfBufferSize() const; + size_t profilingTimerResolution() const; + + static const Device& getDefault(); + +protected: + struct Impl; + Impl* p; +}; + + +class CV_EXPORTS Context +{ +public: 
+    Context();
+    explicit Context(int dtype);
+    ~Context();
+    Context(const Context& c);
+    Context& operator = (const Context& c);
+
+    bool create(int dtype);
+    size_t ndevices() const;
+    const Device& device(size_t idx) const;
+    int dtype() const;
+    Program getProg(const ProgramSource& prog,
+                    const String& buildopt, String& errmsg);
+
+    static Context& getDefault();
+    void* ptr() const;
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+
+class CV_EXPORTS Queue
+{
+public:
+    Queue();
+    explicit Queue(const Context& c, const Device& d=Device());
+    ~Queue();
+    Queue(const Queue& q);
+    Queue& operator = (const Queue& q);
+
+    bool create(const Context& c=Context(), const Device& d=Device());
+    void finish();
+    void* ptr() const;
+    static Queue& getDefault();
+
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+
+class CV_EXPORTS KernelArg
+{
+public:
+    enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8 };
+    KernelArg(int _flags, UMat* _m, void* _obj=0, size_t _sz=0);
+
+    static KernelArg Local() { return KernelArg(LOCAL, 0); }
+    static KernelArg ReadOnly(const UMat& m) { return KernelArg(READ_ONLY, (UMat*)&m); }
+    static KernelArg WriteOnly(const UMat& m) { return KernelArg(WRITE_ONLY, (UMat*)&m); }
+    static KernelArg Constant(const Mat& m);
+    template<typename _Tp> static KernelArg Constant(const _Tp* arr, size_t n)
+    { return KernelArg(CONSTANT, 0, (void*)arr, n); }
+
+    int flags;
+    UMat* m;
+    void* obj;
+    size_t sz;
+};
+
+class CV_EXPORTS Kernel
+{
+public:
+    class CV_EXPORTS Callback
+    {
+    public:
+        virtual ~Callback() {}
+        virtual void operator()() = 0;
+    };
+
+    Kernel();
+    Kernel(const char* kname, const Program& prog);
+    Kernel(const char* kname, const ProgramSource& prog,
+           const String& buildopts, String& errmsg);
+    ~Kernel();
+    Kernel(const Kernel& k);
+    Kernel& operator = (const Kernel& k);
+
+    bool create(const char* kname, const Program& prog);
+    bool create(const char* kname, const ProgramSource& prog,
+                const String& buildopts, String& errmsg);
+
+    int set(int i, const void* value, size_t sz);
+    int set(int i, const UMat& m);
+    int set(int i, const KernelArg& arg);
+    template<typename _Tp> int set(int i, const _Tp& value)
+    { return set(i, &value, sizeof(value)); }
+
+    template<typename _Tp1>
+    Kernel& args(_Tp1 a1)
+    {
+        set(0, a1); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2>
+    Kernel& args(_Tp1 a1, _Tp2 a2)
+    {
+        int i = set(0, a1); set(i, a2); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3)
+    {
+        int i = set(0, a1); i = set(i, a2); set(i, a3); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); set(i, a4);
+        return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4); set(i, a5);
+        return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3,
+             typename _Tp4, typename _Tp5, typename _Tp6>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); set(i, a6); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
+             typename _Tp5, typename _Tp6, typename _Tp7>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); set(i, a7); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4,
+             typename _Tp5, typename _Tp6, typename _Tp7, typename _Tp8>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7, _Tp8 a8)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); set(i, a8);
+        return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
+             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5,
+                 _Tp6 a6, _Tp7 a7, _Tp8 a8, _Tp9 a9)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
+        set(i, a9); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5,
+             typename _Tp6, typename _Tp7, typename _Tp8, typename _Tp9, typename _Tp10>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
+                 _Tp8 a8, _Tp9 a9, _Tp10 a10)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
+        i = set(i, a9); set(i, a10); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5, typename _Tp6,
+             typename _Tp7, typename _Tp8, typename _Tp9, typename _Tp10, typename _Tp11>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
+                 _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
+        i = set(i, a9); i = set(i, a10); set(i, a11); return *this;
+    }
+
+    template<typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4, typename _Tp5, typename _Tp6,
+             typename _Tp7, typename _Tp8, typename _Tp9, typename _Tp10, typename _Tp11, typename _Tp12>
+    Kernel& args(_Tp1 a1, _Tp2 a2, _Tp3 a3, _Tp4 a4, _Tp5 a5, _Tp6 a6, _Tp7 a7,
+                 _Tp8 a8, _Tp9 a9, _Tp10 a10, _Tp11 a11, _Tp12 a12)
+    {
+        int i = set(0, a1); i = set(i, a2); i = set(i, a3); i = set(i, a4);
+        i = set(i, a5); i = set(i, a6); i = set(i, a7); i = set(i, a8);
+        i = set(i, a9); i = set(i, a10); i = set(i, a11); set(i, a12);
+        return *this;
+    }
+
+    void run(int dims, size_t offset[],
+             size_t globalsize[], size_t localsize[], bool sync,
+             const Ptr<Callback>& cleanupCallback=Ptr<Callback>(),
+             const Queue& q=Queue());
+    void runTask(bool sync,
+             const Ptr<Callback>& cleanupCallback=Ptr<Callback>(),
+             const Queue& q=Queue());
+
+    size_t workGroupSize() const;
+    bool compileWorkGroupSize(size_t wsz[]) const;
+    size_t localMemSize() const;
+
+    void* ptr() const;
+    struct Impl;
+
+protected:
+    Impl* p;
+};
+
+class CV_EXPORTS Program
+{
+public:
+    Program();
+    Program(const ProgramSource& src,
+            const String& buildflags, String& errmsg);
+    explicit Program(const String& buf);
+    Program(const Program& prog);
+
+    Program& operator = (const Program& prog);
+    ~Program();
+
+    bool create(const ProgramSource& src,
+                const String& buildflags, String& errmsg);
+    bool read(const String& buf, const String& buildflags);
+    bool write(String& buf) const;
+
+    const ProgramSource& source() const;
+    void* ptr() const;
+
+    String getPrefix() const;
+    static String getPrefix(const String& buildflags);
+
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+
+class CV_EXPORTS ProgramSource
+{
+public:
+    typedef uint64 hash_t;
+
+    ProgramSource();
+    explicit ProgramSource(const String& prog);
+    explicit ProgramSource(const char* prog);
+    ~ProgramSource();
+    ProgramSource(const ProgramSource& prog);
+    ProgramSource& operator = (const ProgramSource& prog);
+
+    const String& source() const;
+    hash_t hash() const;
+
+protected:
+    struct Impl;
+    Impl* p;
+};
+
+}}
+
+#endif
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp
new file mode 100644
index 0000000..fdcedfa
--- /dev/null
+++ b/modules/core/src/ocl.cpp
@@ -0,0 +1,2901 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
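Taken together, the classes declared in ocl.hpp above form the core of the transparent API: a ProgramSource is compiled on demand into a Program, wrapped by a Kernel, and launched with run(). A hedged usage sketch follows; the kernel text, the "add_k" name, the UMat contents and the work size are illustrative assumptions, not part of this patch:

    using namespace cv::ocl;

    // OpenCL C source; "add_k" is a made-up kernel name for this sketch
    const char* ksrc =
        "__kernel void add_k(__global const float* a, __global const float* b,\n"
        "                    __global float* c)\n"
        "{ int i = get_global_id(0); c[i] = a[i] + b[i]; }\n";

    String errmsg;
    Kernel k("add_k", ProgramSource(ksrc), "", errmsg); // built via the default Context
    if( !k.ptr() )
        printf("build failed: %s\n", errmsg.c_str());  // errmsg holds the build log

    UMat a, b, c; // assumed to be allocated and filled elsewhere (UMat is added by this commit)
    k.args(KernelArg::ReadOnly(a), KernelArg::ReadOnly(b), KernelArg::WriteOnly(c));

    size_t globalsize[] = { 1024 };   // one work-item per element (illustrative)
    k.run(1, 0, globalsize, 0, true); // synchronous launch; local size left to the driver
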
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the OpenCV Foundation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+#include <map>
+
+/*
+  Part of the file is an extract from the standard OpenCL headers from Khronos site.
+  Below is the original copyright.
+*/
+
+/*******************************************************************************
+ * Copyright (c) 2008 - 2012 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/ + +#if 0 //defined __APPLE__ +#define HAVE_OPENCL 1 +#else +#undef HAVE_OPENCL +#endif + +#define OPENCV_CL_NOT_IMPLEMENTED -1000 + +#ifdef HAVE_OPENCL + +#if defined __APPLE__ +#include +#else +#include +#endif + +static const bool g_haveOpenCL = true; + +#else + +extern "C" { + +struct _cl_platform_id { int dummy; }; +struct _cl_device_id { int dummy; }; +struct _cl_context { int dummy; }; +struct _cl_command_queue { int dummy; }; +struct _cl_mem { int dummy; }; +struct _cl_program { int dummy; }; +struct _cl_kernel { int dummy; }; +struct _cl_event { int dummy; }; +struct _cl_sampler { int dummy; }; + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef int cl_int; +typedef unsigned cl_uint; +typedef long cl_long; +typedef unsigned long cl_ulong; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; + + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; + cl_mem buffer; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +////////////////////////////////////////////////////////// + +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define 
CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 + +/*#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1*/ + +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 + +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define 
CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B + +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) + +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define 
CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +// reserved (1 << 6) +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE + +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF + +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 + +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 + +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 + +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define 
CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 + +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 + +#define CL_CALLBACK CV_STDCALL + +static 
volatile bool g_haveOpenCL = false;
+static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
+
+#if defined(__APPLE__)
+#include <dlfcn.h>
+
+static void* initOpenCLAndLoad(const char* funcname)
+{
+    static bool initialized = false;
+    static void* handle = 0;
+    if (!handle)
+    {
+        if(!initialized)
+        {
+            handle = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_LAZY);
+            initialized = true;
+            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
+        }
+        if(!handle)
+            return 0;
+    }
+
+    return funcname ? dlsym(handle, funcname) : 0;
+}
+
+#elif defined WIN32 || defined _WIN32
+
+#ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
+  #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
+#endif
+#include <windows.h>
+#if (_WIN32_WINNT >= 0x0602)
+  #include <synchapi.h>
+#endif
+#undef small
+#undef min
+#undef max
+#undef abs
+
+static void* initOpenCLAndLoad(const char* funcname)
+{
+    static bool initialized = false;
+    static HMODULE handle = 0;
+    if (!handle)
+    {
+        if(!initialized)
+        {
+            handle = LoadLibraryA("OpenCL.dll");
+            initialized = true;
+            g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
+        }
+        if(!handle)
+            return 0;
+    }
+
+    return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
+}
+
+#elif defined(__linux)
+
+#include <dlfcn.h>
+#include <stdio.h>
+
+static void* initOpenCLAndLoad(const char* funcname)
+{
+    static bool initialized = false;
+    static void* handle = 0;
+    if (!handle)
+    {
+        if(!initialized)
+        {
+            handle = dlopen("libOpenCL.so", RTLD_LAZY);
+            if(!handle)
+                handle = dlopen("libCL.so", RTLD_LAZY);
+            initialized = true;
+            g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
+        }
+        if(!handle)
+            return 0;
+    }
+
+    return funcname ?
(void*)dlsym(handle, funcname) : 0; +} + +#else + +static void* initOpenCLAndLoad(const char*) +{ + return 0; +} + +#endif + + +#define OCL_FUNC(rettype, funcname, argsdecl, args) \ + typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ + static rettype funcname argsdecl \ + { \ + static funcname##_t funcname##_p = 0; \ + if( !funcname##_p ) \ + { \ + funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ + if( !funcname##_p ) \ + return OPENCV_CL_NOT_IMPLEMENTED; \ + } \ + return funcname##_p args; \ + } + + +#define OCL_FUNC_P(rettype, funcname, argsdecl, args) \ + typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ + static rettype funcname argsdecl \ + { \ + static funcname##_t funcname##_p = 0; \ + if( !funcname##_p ) \ + { \ + funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ + if( !funcname##_p ) \ + { \ + if( errcode_ret ) \ + *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \ + return 0; \ + } \ + } \ + return funcname##_p args; \ + } + +OCL_FUNC(cl_int, clGetPlatformIDs, + (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms), + (num_entries, platforms, num_platforms)) + +OCL_FUNC(cl_int, clGetPlatformInfo, + (cl_platform_id platform, cl_platform_info param_name, + size_t param_value_size, void * param_value, + size_t * param_value_size_ret), + (platform, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clGetDeviceInfo, + (cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (device, param_name, param_value_size, param_value, param_value_size_ret)) + + +OCL_FUNC(cl_int, clGetDeviceIDs, + (cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices), + (platform, device_type, num_entries, devices, num_devices)) + +OCL_FUNC_P(cl_context, clCreateContext, + (const cl_context_properties * properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret), + (properties, num_devices, devices, pfn_notify, user_data, errcode_ret)) + +OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context)) + +/* +OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context)) + +OCL_FUNC_P(cl_context, clCreateContextFromType, + (const cl_context_properties * properties, + cl_device_type device_type, + void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), + void * user_data, + cl_int * errcode_ret), + (properties, device_type, pfn_notify, user_data, errcode_ret)) + +OCL_FUNC(cl_int, clGetContextInfo, + (cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (context, param_name, param_value_size, + param_value, param_value_size_ret)) +*/ +OCL_FUNC_P(cl_command_queue, clCreateCommandQueue, + (cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret), + (context, device, properties, errcode_ret)) + +OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue)) + +OCL_FUNC_P(cl_mem, clCreateBuffer, + (cl_context context, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret), + (context, flags, size, host_ptr, errcode_ret)) + +/* +OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue)) + +OCL_FUNC(cl_int, 
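For clarity, this is what the OCL_FUNC macro above generates for clGetPlatformIDs when expanded by hand: each OpenCL entry point is resolved once through initOpenCLAndLoad(), and callers get a harmless OPENCV_CL_NOT_IMPLEMENTED stub when no runtime is present, so the module has no hard link dependency on an OpenCL SDK. This is a mechanical expansion, shown for illustration only:

    typedef cl_int (CV_STDCALL* clGetPlatformIDs_t)(cl_uint num_entries,
        cl_platform_id* platforms, cl_uint* num_platforms);
    static cl_int clGetPlatformIDs(cl_uint num_entries,
        cl_platform_id* platforms, cl_uint* num_platforms)
    {
        static clGetPlatformIDs_t clGetPlatformIDs_p = 0;
        if( !clGetPlatformIDs_p )
        {
            // resolve the symbol lazily, on first use
            clGetPlatformIDs_p = (clGetPlatformIDs_t)initOpenCLAndLoad("clGetPlatformIDs");
            if( !clGetPlatformIDs_p )
                return OPENCV_CL_NOT_IMPLEMENTED; // graceful failure without a runtime
        }
        return clGetPlatformIDs_p(num_entries, platforms, num_platforms);
    }
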
clGetCommandQueueInfo, + (cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (command_queue, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC_P(cl_mem, clCreateSubBuffer, + (cl_mem buffer, + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * errcode_ret), + (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret)) + +OCL_FUNC_P(cl_mem, clCreateImage, + (cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + void * host_ptr, + cl_int * errcode_ret), + (context, flags, image_format, image_desc, host_ptr, errcode_ret)) + +OCL_FUNC(cl_int, clGetSupportedImageFormats, + (cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format * image_formats, + cl_uint * num_image_formats), + (context, flags, image_type, num_entries, image_formats, num_image_formats)) + +OCL_FUNC(cl_int, clGetMemObjectInfo, + (cl_mem memobj, + cl_mem_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (memobj, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clGetImageInfo, + (cl_mem image, + cl_image_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (image, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clCreateKernelsInProgram, + (cl_program program, + cl_uint num_kernels, + cl_kernel * kernels, + cl_uint * num_kernels_ret), + (program, num_kernels, kernels, num_kernels_ret)) + +OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel)) + +OCL_FUNC(cl_int, clGetKernelArgInfo, + (cl_kernel kernel, + cl_uint arg_indx, + cl_kernel_arg_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clEnqueueReadImage, + (cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * origin[3], + const size_t * region[3], + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, image, blocking_read, origin, region, + row_pitch, slice_pitch, + ptr, + num_events_in_wait_list, + event_wait_list, + event)) + +OCL_FUNC(cl_int, clEnqueueWriteImage, + (cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_write, + const size_t * origin[3], + const size_t * region[3], + size_t input_row_pitch, + size_t input_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, image, blocking_write, origin, region, input_row_pitch, + input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueFillImage, + (cl_command_queue command_queue, + cl_mem image, + const void * fill_color, + const size_t * origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, image, fill_color, origin, region, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueCopyImage, + (cl_command_queue command_queue, + cl_mem src_image, + cl_mem 
dst_image, + const size_t * src_origin[3], + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, src_image, dst_image, src_origin, dst_origin, + region, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer, + (cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t * src_origin[3], + const size_t * region[3], + size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, src_image, dst_buffer, src_origin, region, dst_offset, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueCopyBufferToImage, + (cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t * dst_origin[3], + const size_t * region[3], + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, src_buffer, dst_image, src_offset, dst_origin, + region, num_events_in_wait_list, event_wait_list, event)) + + +OCL_FUNC_P(void*, clEnqueueMapImage, + (cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_map, + cl_map_flags map_flags, + const size_t * origin[3], + const size_t * region[3], + size_t * image_row_pitch, + size_t * image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret), + (command_queue, image, blocking_map, map_flags, origin, region, + image_row_pitch, image_slice_pitch, num_events_in_wait_list, + event_wait_list, event, errcode_ret)) + +OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program)) + +OCL_FUNC(cl_int, clGetKernelInfo, + (cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (kernel, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj)) + +*/ + +OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj)) + + +OCL_FUNC_P(cl_program, clCreateProgramWithSource, + (cl_context context, + cl_uint count, + const char ** strings, + const size_t * lengths, + cl_int * errcode_ret), + (context, count, strings, lengths, errcode_ret)) + +OCL_FUNC_P(cl_program, clCreateProgramWithBinary, + (cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char ** binaries, + cl_int * binary_status, + cl_int * errcode_ret), + (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret)) + +OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program)) + +OCL_FUNC(cl_int, clBuildProgram, + (cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (CL_CALLBACK * pfn_notify)(cl_program, void *), + void * user_data), + (program, num_devices, device_list, options, pfn_notify, user_data)) + +OCL_FUNC(cl_int, clGetProgramInfo, + (cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (program, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clGetProgramBuildInfo, + (cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (program, 
device, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC_P(cl_kernel, clCreateKernel, + (cl_program program, + const char * kernel_name, + cl_int * errcode_ret), + (program, kernel_name, errcode_ret)) + +OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel)) + +OCL_FUNC(cl_int, clSetKernelArg, + (cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void * arg_value), + (kernel, arg_index, arg_size, arg_value)) + +OCL_FUNC(cl_int, clGetKernelWorkGroupInfo, + (cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret), + (kernel, device, param_name, param_value_size, param_value, param_value_size_ret)) + +OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue)) + +OCL_FUNC(cl_int, clEnqueueReadBuffer, + (cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, buffer, blocking_read, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueReadBufferRect, + (cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch, + buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, + event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueWriteBuffer, + (cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, buffer, blocking_write, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueWriteBufferRect, + (cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, buffer, blocking_write, buffer_offset, host_offset, + region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, + host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueFillBuffer, + (cl_command_queue command_queue, + cl_mem buffer, + const void * pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, buffer, pattern, pattern_size, offset, size, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueCopyBuffer, + (cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, src_buffer, dst_buffer, 
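The buffer entry points bound above are already enough for a complete host-device round trip. A minimal sketch, assuming a valid context and queue obtained from the Context/Queue wrappers declared in ocl.hpp, with error checking omitted:

    cl_context ctx = (cl_context)cv::ocl::Context::getDefault().ptr();
    cl_command_queue q = (cl_command_queue)cv::ocl::Queue::getDefault().ptr();

    float src[256], dst[256] = {0};
    for( int i = 0; i < 256; i++ ) src[i] = (float)i;

    cl_int err = 0;
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(src), 0, &err);
    clEnqueueWriteBuffer(q, buf, CL_TRUE, 0, sizeof(src), src, 0, 0, 0); // blocking upload
    clEnqueueReadBuffer(q, buf, CL_TRUE, 0, sizeof(dst), dst, 0, 0, 0);  // blocking download
    clReleaseMemObject(buf); // dst now mirrors src
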
src_offset, dst_offset, + size, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueCopyBufferRect, + (cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, src_buffer, dst_buffer, src_origin, dst_origin, + region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, + num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC_P(void*, clEnqueueMapBuffer, + (cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret), + (command_queue, buffer, blocking_map, map_flags, offset, size, + num_events_in_wait_list, event_wait_list, event, errcode_ret)) + +OCL_FUNC(cl_int, clEnqueueUnmapMemObject, + (cl_command_queue command_queue, + cl_mem memobj, + void * mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueNDRangeKernel, + (cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t * global_work_offset, + const size_t * global_work_size, + const size_t * local_work_size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, kernel, work_dim, global_work_offset, global_work_size, + local_work_size, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clEnqueueTask, + (cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event), + (command_queue, kernel, num_events_in_wait_list, event_wait_list, event)) + +OCL_FUNC(cl_int, clSetEventCallback, + (cl_event event, + cl_int command_exec_callback_type , + void (CL_CALLBACK *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data), + void *user_data), + (event, command_exec_callback_type, pfn_event_notify, user_data)) + +OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event)) + +} + +#endif + +namespace cv { namespace ocl { + +// Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 +static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 ) +{ + static uint64 table[256]; + static bool initialized = false; + + if( !initialized ) + { + for( int i = 0; i < 256; i++ ) + { + uint64 c = i; + for( int j = 0; j < 8; j++ ) + c = ((c & 1) ? 
CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1); + table[i] = c; + } + initialized = true; + } + + uint64 crc = ~crc0; + for( size_t idx = 0; idx < size; idx++ ) + crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8); + + return ~crc; +} + +struct HashKey +{ + typedef uint64 part; + HashKey(part _a, part _b) : a(_a), b(_b) {} + part a, b; +}; + +inline bool operator == (const HashKey& h1, const HashKey& h2) +{ + return h1.a == h2.a && h1.b == h2.b; +} + +inline bool operator < (const HashKey& h1, const HashKey& h2) +{ + return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b); +} + +bool haveOpenCL() +{ + initOpenCLAndLoad(0); + return g_haveOpenCL; +} + +bool useOpenCL() +{ + TLSData* data = TLSData::get(); + if( data->useOpenCL < 0 ) + data->useOpenCL = (int)haveOpenCL(); + return data->useOpenCL > 0; +} + +void finish() +{ + Queue::getDefault().finish(); +} + +#define IMPLEMENT_REFCOUNTABLE() \ + void addref() { CV_XADD(&refcount, 1); } \ + void release() { if( CV_XADD(&refcount, -1) == 1 ) delete this; } \ + int refcount + +class Platform +{ +public: + Platform(); + ~Platform(); + Platform(const Platform& p); + Platform& operator = (const Platform& p); + + void* ptr() const; + static Platform& getDefault(); +protected: + struct Impl; + Impl* p; +}; + +struct Platform::Impl +{ + Impl() + { + refcount = 1; + handle = 0; + initialized = false; + } + + ~Impl() {} + + void init() + { + if( !initialized ) + { + //cl_uint num_entries + cl_uint n = 0; + if( clGetPlatformIDs(1, &handle, &n) < 0 || n == 0 ) + handle = 0; + if( handle != 0 ) + { + char buf[1000]; + size_t len = 0; + clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len); + buf[len] = '\0'; + vendor = String(buf); + } + + initialized = true; + } + } + + IMPLEMENT_REFCOUNTABLE(); + + cl_platform_id handle; + String vendor; + bool initialized; +}; + +Platform::Platform() +{ + p = 0; +} + +Platform::~Platform() +{ + if(p) + p->release(); +} + +Platform::Platform(const Platform& pl) +{ + p = (Impl*)pl.p; + if(p) + p->addref(); +} + +Platform& Platform::operator = (const Platform& pl) +{ + Impl* newp = (Impl*)pl.p; + if(newp) + newp->addref(); + if(p) + p->release(); + p = newp; + return *this; +} + +void* Platform::ptr() const +{ + return p ? p->handle : 0; +} + +Platform& Platform::getDefault() +{ + static Platform p; + if( !p.p ) + { + p.p = new Impl; + p.p->init(); + } + return p; +} + +/////////////////////////////////////////////////////////////////////////////////// + +struct Device::Impl +{ + Impl(void* d) + { + handle = (cl_device_id)d; + } + + template + _TpOut getProp(cl_device_info prop) const + { + _TpCL temp=_TpCL(); + size_t sz = 0; + + return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) >= 0 && + sz == sizeof(temp) ? _TpOut(temp) : _TpOut(); + } + + String getStrProp(cl_device_info prop) const + { + char buf[1024]; + size_t sz=0; + return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) >= 0 && + sz < sizeof(buf) ? String(buf) : String(); + } + + IMPLEMENT_REFCOUNTABLE(); + cl_device_id handle; +}; + + +Device::Device() +{ + p = 0; +} + +Device::Device(void* d) +{ + p = 0; + set(d); +} + +Device::Device(const Device& d) +{ + p = d.p; + if(p) + p->addref(); +} + +Device& Device::operator = (const Device& d) +{ + Impl* newp = (Impl*)d.p; + if(newp) + newp->addref(); + if(p) + p->release(); + p = newp; + return *this; +} + +Device::~Device() +{ + if(p) + p->release(); +} + +void Device::set(void* d) +{ + if(p) + p->release(); + p = new Impl(d); +} + +void* Device::ptr() const +{ + return p ? 
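All of the wrapper classes in this file share the same copy-on-assign idiom built from IMPLEMENT_REFCOUNTABLE(): CV_XADD returns the previous counter value, so release() deletes the Impl exactly when the count drops from 1 to 0, and copying a wrapper is just a counter bump. A sketch of the intended behavior (assuming the Impl's refcount starts at 1):

    cv::ocl::Device d1 = cv::ocl::Device::getDefault();
    cv::ocl::Device d2 = d1;     // copy ctor: p = d1.p; p->addref()
    d1 = cv::ocl::Device();      // operator=: releases one reference
    // d2 still shares the Impl (and its cl_device_id); the Impl is
    // deleted exactly once, when the last wrapper referencing it goes away
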
p->handle : 0;
+}
+
+String Device::name() const
+{ return p ? p->getStrProp(CL_DEVICE_NAME) : String(); }
+
+String Device::extensions() const
+{ return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
+
+String Device::vendor() const
+{ return p ? p->getStrProp(CL_DEVICE_VENDOR) : String(); }
+
+String Device::OpenCL_C_Version() const
+{ return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
+
+String Device::OpenCLVersion() const
+{ return p ? p->getStrProp(CL_DEVICE_VERSION) : String(); }
+
+String Device::driverVersion() const
+{ return p ? p->getStrProp(CL_DRIVER_VERSION) : String(); }
+
+int Device::type() const
+{ return p ? p->getProp(CL_DEVICE_TYPE) : 0; }
+
+int Device::addressBits() const
+{ return p ? p->getProp(CL_DEVICE_ADDRESS_BITS) : 0; }
+
+bool Device::available() const
+{ return p ? p->getProp(CL_DEVICE_AVAILABLE) : 0; }
+
+bool Device::compilerAvailable() const
+{ return p ? p->getProp(CL_DEVICE_COMPILER_AVAILABLE) : 0; }
+
+bool Device::linkerAvailable() const
+{ return p ? p->getProp(CL_DEVICE_LINKER_AVAILABLE) : 0; }
+
+int Device::doubleFPConfig() const
+{ return p ? p->getProp(CL_DEVICE_DOUBLE_FP_CONFIG) : 0; }
+
+int Device::singleFPConfig() const
+{ return p ? p->getProp(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
+
+int Device::halfFPConfig() const
+{ return p ? p->getProp(CL_DEVICE_HALF_FP_CONFIG) : 0; }
+
+bool Device::endianLittle() const
+{ return p ? p->getProp(CL_DEVICE_ENDIAN_LITTLE) : 0; }
+
+bool Device::errorCorrectionSupport() const
+{ return p ? p->getProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : 0; }
+
+int Device::executionCapabilities() const
+{ return p ? p->getProp(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
+
+size_t Device::globalMemCacheSize() const
+{ return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
+
+int Device::globalMemCacheType() const
+{ return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
+
+int Device::globalMemCacheLineSize() const
+{ return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
+
+size_t Device::globalMemSize() const
+{ return p ? p->getProp(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
+
+size_t Device::localMemSize() const
+{ return p ? p->getProp(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
+
+int Device::localMemType() const
+{ return p ? p->getProp(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
+
+bool Device::hostUnifiedMemory() const
+{ return p ? p->getProp(CL_DEVICE_HOST_UNIFIED_MEMORY) : 0; }
+
+bool Device::imageSupport() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE_SUPPORT) : 0; }
+
+size_t Device::image2DMaxWidth() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
+
+size_t Device::image2DMaxHeight() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
+
+size_t Device::image3DMaxWidth() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
+
+size_t Device::image3DMaxHeight() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
+
+size_t Device::image3DMaxDepth() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
+
+size_t Device::imageMaxBufferSize() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
+
+size_t Device::imageMaxArraySize() const
+{ return p ? p->getProp(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
+
+int Device::maxClockFrequency() const
+{ return p ? p->getProp(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
+
+int Device::maxComputeUnits() const
+{ return p ? p->getProp(CL_DEVICE_MAX_COMPUTE_UNITS) : 0; }
+
+int Device::maxConstantArgs() const
+{ return p ?
p->getProp(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; } + +size_t Device::maxConstantBufferSize() const +{ return p ? p->getProp(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; } + +size_t Device::maxMemAllocSize() const +{ return p ? p->getProp(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; } + +size_t Device::maxParameterSize() const +{ return p ? p->getProp(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; } + +int Device::maxReadImageArgs() const +{ return p ? p->getProp(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; } + +int Device::maxWriteImageArgs() const +{ return p ? p->getProp(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; } + +int Device::maxSamplers() const +{ return p ? p->getProp(CL_DEVICE_MAX_SAMPLERS) : 0; } + +size_t Device::maxWorkGroupSize() const +{ return p ? p->getProp(CL_DEVICE_MAX_WORK_GROUP_SIZE) : 0; } + +int Device::maxWorkItemDims() const +{ return p ? p->getProp(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; } + +void Device::maxWorkItemSizes(size_t* sizes) const +{ + if(p) + { + const int MAX_DIMS = 32; + size_t retsz = 0; + clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES, + MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz); + } +} + +int Device::memBaseAddrAlign() const +{ return p ? p->getProp(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; } + +int Device::nativeVectorWidthChar() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; } + +int Device::nativeVectorWidthShort() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; } + +int Device::nativeVectorWidthInt() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; } + +int Device::nativeVectorWidthLong() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; } + +int Device::nativeVectorWidthFloat() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; } + +int Device::nativeVectorWidthDouble() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; } + +int Device::nativeVectorWidthHalf() const +{ return p ? p->getProp(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; } + +int Device::preferredVectorWidthChar() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; } + +int Device::preferredVectorWidthShort() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; } + +int Device::preferredVectorWidthInt() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; } + +int Device::preferredVectorWidthLong() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; } + +int Device::preferredVectorWidthFloat() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; } + +int Device::preferredVectorWidthDouble() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; } + +int Device::preferredVectorWidthHalf() const +{ return p ? p->getProp(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; } + +size_t Device::printfBufferSize() const +{ return p ? p->getProp(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; } + +size_t Device::profilingTimerResolution() const +{ return p ? 
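With the accessors above, device introspection becomes a few lines of user code. For illustration, a short sketch (it assumes the default context and device initialize successfully; the selection of printed properties is arbitrary):

    const cv::ocl::Device& dev = cv::ocl::Device::getDefault();
    if( dev.available() )
    {
        printf("OpenCL device: %s (vendor: %s, driver: %s)\n",
               dev.name().c_str(), dev.vendor().c_str(), dev.driverVersion().c_str());
        printf("%d compute units, max work-group size %d, fp64 %s\n",
               dev.maxComputeUnits(), (int)dev.maxWorkGroupSize(),
               dev.doubleFPConfig() != 0 ? "yes" : "no");
    }
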
+
+const Device& Device::getDefault()
+{
+    const Context& ctx = Context::getDefault();
+    int idx = TLSData::get()->device;
+    return ctx.device(idx);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////
+
+struct Context::Impl
+{
+    Impl(int dtype0)
+    {
+        refcount = 1;
+        handle = 0;
+        dtype = dtype0;
+
+        cl_int retval = 0;
+        cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
+        cl_context_properties prop[] =
+        {
+            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
+            0
+        };
+
+        cl_uint i, nd0 = 0, nd = 0;
+        int devtype = dtype0 & 15;
+        retval = clGetDeviceIDs( pl, devtype, 0, 0, &nd0 );
+        if(retval < 0 || nd0 == 0)
+            return;
+        AutoBuffer<void*> dlistbuf(nd0*2+1);
+        cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
+        cl_device_id* dlist_new = dlist + nd0;
+        clGetDeviceIDs( pl, devtype, nd0, dlist, &nd0 );
+        String name0;
+
+        for(i = 0; i < nd0; i++)
+        {
+            Device d(dlist[i]);
+            if( !d.available() || !d.compilerAvailable() )
+                continue;
+            if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
+                continue;
+            if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
+                continue;
+            String name = d.name();
+            if( nd != 0 && name != name0 )
+                continue;
+            name0 = name;
+            dlist_new[nd++] = dlist[i];
+        }
+
+        if(nd == 0)
+            return;
+
+        // !!! in the current implementation force the number of devices to 1 !!!
+        nd = 1;
+
+        handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
+        bool ok = handle != 0 && retval >= 0;
+        if( ok )
+        {
+            devices.resize(nd);
+            for( i = 0; i < nd; i++ )
+                devices[i].set(dlist_new[i]);
+        }
+    }
+
+    ~Impl()
+    {
+        if(handle)
+            clReleaseContext(handle);
+        devices.clear();
+    }
+
+    Program getProg(const ProgramSource& src,
+                    const String& buildflags, String& errmsg)
+    {
+        String prefix = Program::getPrefix(buildflags);
+        HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
+        phash_t::iterator it = phash.find(k);
+        if( it != phash.end() )
+            return it->second;
+        //String filename = format("%08x%08x_%08x%08x.clb2",
+        Program prog(src, buildflags, errmsg);
+        phash.insert(std::pair<HashKey, Program>(k, prog));
+        return prog;
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_context handle;
+    std::vector<Device> devices;
+    int dtype;
+    bool initialized;
+
+    typedef ProgramSource::hash_t hash_t;
+
+    struct HashKey
+    {
+        HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
+        bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
+        bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
+        bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
+        hash_t a, b;
+    };
+    typedef std::map<HashKey, Program> phash_t;
+    phash_t phash;
+};
+
+
+Context::Context()
+{
+    p = 0;
+}
+
+Context::Context(int dtype)
+{
+    p = 0;
+    create(dtype);
+}
+
+int Context::dtype() const
+{
+    return p ? p->dtype : 0;
+}
+
+bool Context::create(int dtype0)
+{
+    if( !haveOpenCL() )
+        return false;
+    if(p)
+        p->release();
+    p = new Impl(dtype0);
+    if(!p->handle)
+    {
+        delete p;
+        p = 0;
+    }
+    return p != 0;
+}
+
+Context::~Context()
+{
+    if(p)
+        p->release();
+}
+
+Context::Context(const Context& c)
+{
+    p = (Impl*)c.p;
+    if(p)
+        p->addref();
+}
+
+Context& Context::operator = (const Context& c)
+{
+    Impl* newp = (Impl*)c.p;
+    if(newp)
+        newp->addref();
+    if(p)
+        p->release();
+    p = newp;
+    return *this;
+}
+
+void* Context::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
+size_t Context::ndevices() const
+{
+    return p ? p->devices.size() : 0;
+}
+
+const Device& Context::device(size_t idx) const
+{
+    static Device dummy;
+    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
+}
+
+Context& Context::getDefault()
+{
+    static Context ctx;
+    if( !ctx.p && haveOpenCL() )
+    {
+        // do not create new Context right away.
+        // First, try to retrieve existing context of the same type.
+        // In its turn, Platform::getContext() may call Context::create()
+        // if there is no such context.
+        ctx.create(Device::TYPE_ACCELERATOR);
+        if(!ctx.p)
+            ctx.create(Device::TYPE_DGPU);
+        if(!ctx.p)
+            ctx.create(Device::TYPE_IGPU);
+        if(!ctx.p)
+            ctx.create(Device::TYPE_CPU);
+    }
+
+    return ctx;
+}
+
+Program Context::getProg(const ProgramSource& prog,
+                         const String& buildopts, String& errmsg)
+{
+    return p ? p->getProg(prog, buildopts, errmsg) : Program();
+}
+
+struct Queue::Impl
+{
+    Impl(const Context& c, const Device& d)
+    {
+        refcount = 1;
+        const Context* pc = &c;
+        cl_context ch = (cl_context)pc->ptr();
+        if( !ch )
+        {
+            pc = &Context::getDefault();
+            ch = (cl_context)pc->ptr();
+        }
+        cl_device_id dh = (cl_device_id)d.ptr();
+        if( !dh )
+            dh = (cl_device_id)pc->device(0).ptr();
+        cl_int retval = 0;
+        handle = clCreateCommandQueue(ch, dh, 0, &retval);
+    }
+
+    ~Impl()
+    {
+        if(handle)
+        {
+            clFinish(handle);
+            clReleaseCommandQueue(handle);
+        }
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_command_queue handle;
+    bool initialized;
+};
+
+Queue::Queue()
+{
+    p = 0;
+}
+
+Queue::Queue(const Context& c, const Device& d)
+{
+    p = 0;
+    create(c, d);
+}
+
+Queue::Queue(const Queue& q)
+{
+    p = q.p;
+    if(p)
+        p->addref();
+}
+
+Queue& Queue::operator = (const Queue& q)
+{
+    Impl* newp = (Impl*)q.p;
+    if(newp)
+        newp->addref();
+    if(p)
+        p->release();
+    p = newp;
+    return *this;
+}
+
+Queue::~Queue()
+{
+    if(p)
+        p->release();
+}
+
+bool Queue::create(const Context& c, const Device& d)
+{
+    if(p)
+        p->release();
+    p = new Impl(c, d);
+    return p->handle != 0;
+}
+
+void Queue::finish()
+{
+    if(p && p->handle)
+        clFinish(p->handle);
+}
+
+void* Queue::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
+Queue& Queue::getDefault()
+{
+    Queue& q = TLSData::get()->oclQueue;
+    if( !q.p || !q.p->handle )
+        q.create(Context::getDefault());
+    return q;
+}
+
+static cl_command_queue getQueue(const Queue& q)
+{
+    cl_command_queue qq = (cl_command_queue)q.ptr();
+    if(!qq)
+        qq = (cl_command_queue)Queue::getDefault().ptr();
+    return qq;
+}
+
+KernelArg::KernelArg(int _flags, UMat* _m, void* _obj, size_t _sz)
+    : flags(_flags), m(_m), obj(_obj), sz(_sz)
+{
+}
+
+KernelArg KernelArg::Constant(const Mat& m)
+{
+    CV_Assert(m.isContinuous());
+    return KernelArg(CONSTANT, 0, m.data, m.total()*m.elemSize());
+}
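KernelArg and the Kernel class implemented below are the low-level way to invoke user OpenCL code. A minimal sketch of the intended calling sequence; the no-op kernel is a toy example, and run() is assumed to have default arguments for the cleanup callback and queue as declared in ocl.hpp:

    cv::ocl::ProgramSource src("__kernel void noop() { }");
    cv::String errmsg;
    cv::ocl::Kernel k("noop", src, "", errmsg); // built via the Context::getProg() cache
    if( k.ptr() )                               // null handle => build failed; see errmsg
    {
        size_t globalsize[] = { k.workGroupSize() };
        k.run(1, 0, globalsize, 0, true);       // blocking run on the default Queue
    }
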
+
+struct Kernel::Impl
+{
+    Impl(const char* kname, const Program& prog)
+    {
+        e = 0; refcount = 1;
+        cl_program ph = (cl_program)prog.ptr();
+        cl_int retval = 0;
+        handle = ph != 0 ? clCreateKernel(ph, kname, &retval) : 0;
+    }
+
+    void finit()
+    {
+        if(!f.empty()) f->operator()();
+        if(e) { clReleaseEvent(e); e = 0; }
+        release();
+    }
+
+    ~Impl()
+    {
+        if(handle)
+            clReleaseKernel(handle);
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    cl_kernel handle;
+    cl_event e;
+    Ptr f;
+};
+
+}}
+
+extern "C"
+{
+static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
+{
+    ((cv::ocl::Kernel::Impl*)p)->finit();
+}
+
+}
+
+namespace cv { namespace ocl {
+
+Kernel::Kernel()
+{
+    p = 0;
+}
+
+Kernel::Kernel(const char* kname, const Program& prog)
+{
+    p = 0;
+    create(kname, prog);
+}
+
+Kernel::Kernel(const char* kname, const ProgramSource& src,
+               const String& buildopts, String& errmsg)
+{
+    p = 0;
+    create(kname, src, buildopts, errmsg);
+}
+
+Kernel::Kernel(const Kernel& k)
+{
+    p = k.p;
+    if(p)
+        p->addref();
+}
+
+Kernel& Kernel::operator = (const Kernel& k)
+{
+    Impl* newp = (Impl*)k.p;
+    if(newp)
+        newp->addref();
+    if(p)
+        p->release();
+    p = newp;
+    return *this;
+}
+
+Kernel::~Kernel()
+{
+    if(p)
+        p->release();
+}
+
+bool Kernel::create(const char* kname, const Program& prog)
+{
+    if(p)
+        p->release();
+    p = new Impl(kname, prog);
+    if(p->handle == 0)
+    {
+        p->release();
+        p = 0;
+    }
+    return p != 0;
+}
+
+bool Kernel::create(const char* kname, const ProgramSource& src,
+                    const String& buildopts, String& errmsg)
+{
+    if(p)
+    {
+        p->release();
+        p = 0;
+    }
+    const Program& prog = Context::getDefault().getProg(src, buildopts, errmsg);
+    return create(kname, prog);
+}
+
+void* Kernel::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
+int Kernel::set(int i, const void* value, size_t sz)
+{
+    CV_Assert( p && clSetKernelArg(p->handle, (cl_uint)i, sz, value) >= 0 );
+    return i+1;
+}
+
+int Kernel::set(int i, const UMat& m)
+{
+    return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
+}
+
+int Kernel::set(int i, const KernelArg& arg)
+{
+    CV_Assert( p && p->handle );
+    if( arg.m )
+    {
+        int dims = arg.m->dims;
+        void* h = arg.m->handle(((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
+                                ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0));
+        clSetKernelArg(p->handle, (cl_uint)i, sizeof(cl_mem), &h);
+        clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(size_t), &arg.m->offset);
+        if( dims <= 2 )
+        {
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t), &arg.m->step.p[0]);
+            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(arg.m->rows), &arg.m->rows);
+            clSetKernelArg(p->handle, (cl_uint)(i+4), sizeof(arg.m->cols), &arg.m->cols);
+            return i + 5;
+        }
+        else
+        {
+            clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(size_t)*(dims-1), &arg.m->step.p[0]);
+            clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(cl_int)*dims, &arg.m->size.p[0]);
+            return i + 4;
+        }
+    }
+    else
+    {
+        clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
+        return i+1;
+    }
+}
+
+
+void Kernel::run(int dims, size_t offset[], size_t globalsize[], size_t localsize[],
+                 bool sync, const Ptr& cleanupCallback, const Queue& q)
+{
+    CV_Assert(p && p->handle && p->e == 0);
+    cl_command_queue qq = getQueue(q);
+    clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
+                           offset, globalsize, localsize, 0, 0,
+                           sync ? 0 : &p->e);
+    if( sync )
+    {
+        clFinish(qq);
+        if( !cleanupCallback.empty() )
+            cleanupCallback->operator()();
+    }
+    else
+    {
+        p->f = cleanupCallback;
+        p->addref();
+        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
+    }
+}
+
+void Kernel::runTask(bool sync, const Ptr& cleanupCallback, const Queue& q)
+{
+    CV_Assert(p && p->handle && p->e == 0);
+    cl_command_queue qq = getQueue(q);
+    clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
+    if( sync )
+    {
+        clFinish(qq);
+        if( !cleanupCallback.empty() )
+            cleanupCallback->operator()();
+    }
+    else
+    {
+        p->f = cleanupCallback;
+        p->addref();
+        clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p);
+    }
+}
+
+
+size_t Kernel::workGroupSize() const
+{
+    if(!p)
+        return 0;
+    size_t val = 0, retsz = 0;
+    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
+    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
+                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
+}
+
+bool Kernel::compileWorkGroupSize(size_t wsz[]) const
+{
+    if(!p || !wsz)
+        return false;
+    size_t retsz = 0;
+    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
+    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
+                                    sizeof(wsz[0])*3, wsz, &retsz) >= 0;
+}
+
+size_t Kernel::localMemSize() const
+{
+    if(!p)
+        return 0;
+    size_t retsz = 0;
+    cl_ulong val = 0;
+    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
+    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
+                                    sizeof(val), &val, &retsz) >= 0 ? (size_t)val : 0;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+struct Program::Impl
+{
+    Impl(const ProgramSource& _src,
+         const String& _buildflags, String& errmsg)
+    {
+        refcount = 1;
+        const Context& ctx = Context::getDefault();
+        src = _src;
+        buildflags = _buildflags;
+        const String& srcstr = src.source();
+        const char* srcptr = srcstr.c_str();
+        size_t srclen = srcstr.size();
+        cl_int retval = 0;
+
+        handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
+        if( handle && retval >= 0 )
+        {
+            int i, n = (int)ctx.ndevices();
+            AutoBuffer<void*> deviceListBuf(n+1);
+            void** deviceList = deviceListBuf;
+            for( i = 0; i < n; i++ )
+                deviceList[i] = ctx.device(i).ptr();
+            retval = clBuildProgram(handle, n,
+                                    (const cl_device_id*)deviceList,
+                                    buildflags.c_str(), 0, 0);
+            if( retval == CL_BUILD_PROGRAM_FAILURE )
+            {
+                char buf[1024];
+                size_t retsz = 0;
+                clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], CL_PROGRAM_BUILD_LOG,
+                                      sizeof(buf)-16, buf, &retsz);
+                errmsg = String(buf);
+            }
+        }
+    }
+
+    Impl(const String& _buf, const String& _buildflags)
+    {
+        refcount = 1;
+        handle = 0;
+        buildflags = _buildflags;
+        if(_buf.empty())
+            return;
+        String prefix0 = Program::getPrefix(buildflags);
+        const Context& ctx = Context::getDefault();
+        const Device& dev = Device::getDefault();
+        const char* pos0 = _buf.c_str();
+        const char* pos1 = strchr(pos0, '\n');
+        if(!pos1)
+            return;
+        const char* pos2 = strchr(pos1+1, '\n');
+        if(!pos2)
+            return;
+        const char* pos3 = strchr(pos2+1, '\n');
+        if(!pos3)
+            return;
+        size_t prefixlen = (pos3 - pos0)+1;
+        String prefix(pos0, prefixlen);
+        if( prefix != prefix0 )
+            return;
+        const uchar* bin = (const uchar*)(pos3+1);
+        void* devid = dev.ptr();
+        size_t codelen = _buf.length() - prefixlen;
+        cl_int binstatus = 0, retval = 0;
+        handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
+                                           &codelen, &bin, &binstatus, &retval);
+    }
+
+    String store()
+    {
+        if(!handle)
+            return String();
+        size_t progsz = 0, retsz = 0;
+        String prefix = Program::getPrefix(buildflags);
+        size_t prefixlen = prefix.length();
+        if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) < 0)
+            return String();
+        AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
+        uchar* buf = bufbuf;
+        memcpy(buf, prefix.c_str(), prefixlen);
+        buf += prefixlen;
+        if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) < 0)
+            return String();
+        buf[progsz] = (uchar)'\0';
+        return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
+    }
+
+    ~Impl()
+    {
+        if( handle )
+            clReleaseProgram(handle);
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+
+    ProgramSource src;
+    String buildflags;
+    cl_program handle;
+};
+
+
+Program::Program() { p = 0; }
+
+Program::Program(const ProgramSource& src,
+                 const String& buildflags, String& errmsg)
+{
+    p = 0;
+    create(src, buildflags, errmsg);
+}
+
+Program::Program(const Program& prog)
+{
+    p = prog.p;
+    if(p)
+        p->addref();
+}
+
+Program& Program::operator = (const Program& prog)
+{
+    Impl* newp = (Impl*)prog.p;
+    if(newp)
+        newp->addref();
+    if(p)
+        p->release();
+    p = newp;
+    return *this;
+}
+
+Program::~Program()
+{
+    if(p)
+        p->release();
+}
+
+bool Program::create(const ProgramSource& src,
+                     const String& buildflags, String& errmsg)
+{
+    if(p)
+        p->release();
+    p = new Impl(src, buildflags, errmsg);
+    if(!p->handle)
+    {
+        p->release();
+        p = 0;
+    }
+    return p != 0;
+}
+
+const ProgramSource& Program::source() const
+{
+    static ProgramSource dummy;
+    return p ? p->src : dummy;
+}
+
+void* Program::ptr() const
+{
+    return p ? p->handle : 0;
+}
+
+bool Program::read(const String& bin, const String& buildflags)
+{
+    if(p)
+        p->release();
+    p = new Impl(bin, buildflags);
+    return p->handle != 0;
+}
+
+bool Program::write(String& bin) const
+{
+    if(!p)
+        return false;
+    bin = p->store();
+    return !bin.empty();
+}
+
+String Program::getPrefix() const
+{
+    if(!p)
+        return String();
+    return getPrefix(p->buildflags);
+}
+
+String Program::getPrefix(const String& buildflags)
+{
+    const Context& ctx = Context::getDefault();
+    const Device& dev = ctx.device(0);
+    return format("name=%s\ndriver=%s\nbuildflags=%s\n",
+                  dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////////////
+
+struct ProgramSource::Impl
+{
+    Impl(const char* _src)
+    {
+        init(String(_src));
+    }
+    Impl(const String& _src)
+    {
+        init(_src);
+    }
+    void init(const String& _src)
+    {
+        refcount = 1;
+        src = _src;
+        h = crc64((const uchar*)src.c_str(), src.size());
+    }
+
+    IMPLEMENT_REFCOUNTABLE();
+    String src;
+    ProgramSource::hash_t h;
+};
+
+
+ProgramSource::ProgramSource()
+{
+    p = 0;
+}
+
+ProgramSource::ProgramSource(const char* prog)
+{
+    p = new Impl(prog);
+}
+
+ProgramSource::ProgramSource(const String& prog)
+{
+    p = new Impl(prog);
+}
+
+ProgramSource::~ProgramSource()
+{
+    if(p)
+        p->release();
+}
+
+ProgramSource::ProgramSource(const ProgramSource& prog)
+{
+    p = prog.p;
+    if(p)
+        p->addref();
+}
+
+ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
+{
+    Impl* newp = (Impl*)prog.p;
+    if(newp)
+        newp->addref();
+    if(p)
+        p->release();
+    p = newp;
+    return *this;
+}
+
+const String& ProgramSource::source() const
+{
+    static String dummy;
+    return p ? p->src : dummy;
+}
+
+ProgramSource::hash_t ProgramSource::hash() const
+{
+    return p ?
p->h : 0; +} + +////////////////////////////////////////////////////////////////////////////////////////////// + +class OpenCLAllocator : public MatAllocator +{ +public: + OpenCLAllocator() {} + + UMatData* defaultAllocate(int dims, const int* sizes, int type, size_t* step) const + { + UMatData* u = Mat::getStdAllocator()->allocate(dims, sizes, type, step); + u->urefcount = 1; + u->refcount = 0; + return u; + } + + void getBestFlags(const Context& ctx, int& createFlags, int& flags0) const + { + const Device& dev = ctx.device(0); + createFlags = CL_MEM_READ_WRITE; + + if( dev.hostUnifiedMemory() ) + flags0 = 0; + else + flags0 = UMatData::COPY_ON_MAP; + } + + UMatData* allocate(int dims, const int* sizes, int type, size_t* step) const + { + if(!useOpenCL()) + return defaultAllocate(dims, sizes, type, step); + size_t total = CV_ELEM_SIZE(type); + for( int i = dims-1; i >= 0; i-- ) + { + if( step ) + step[i] = total; + total *= sizes[i]; + } + + Context& ctx = Context::getDefault(); + int createFlags = 0, flags0 = 0; + getBestFlags(ctx, createFlags, flags0); + + cl_int retval = 0; + void* handle = clCreateBuffer((cl_context)ctx.ptr(), + createFlags, total, 0, &retval); + if( !handle || retval < 0 ) + return defaultAllocate(dims, sizes, type, step); + UMatData* u = new UMatData(this); + u->data = 0; + u->size = total; + u->handle = handle; + u->urefcount = 1; + u->flags = flags0; + + return u; + } + + bool allocate(UMatData* u, int accessFlags) const + { + if(!u) + return false; + + UMatDataAutoLock lock(u); + + if(u->handle == 0) + { + CV_Assert(u->origdata != 0); + Context& ctx = Context::getDefault(); + int createFlags = 0, flags0 = 0; + getBestFlags(ctx, createFlags, flags0); + + cl_context ctx_handle = (cl_context)ctx.ptr(); + cl_int retval = 0; + int tempUMatFlags = UMatData::TEMP_UMAT; + u->handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags, + u->size, u->origdata, &retval); + if((!u->handle || retval < 0) && !(accessFlags & ACCESS_FAST)) + { + u->handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|createFlags, + u->size, u->origdata, &retval); + tempUMatFlags = UMatData::TEMP_COPIED_UMAT; + } + if(!u->handle || retval < 0) + return false; + u->prevAllocator = u->currAllocator; + u->currAllocator = this; + u->flags |= tempUMatFlags; + } + if(accessFlags & ACCESS_WRITE) + u->markHostCopyObsolete(true); + CV_XADD(&u->urefcount, 1); + return true; + } + + void deallocate(UMatData* u) const + { + if(!u) + return; + + // TODO: !!! 
when we add Shared Virtual Memory Support, + // this function (as well as the others should be corrected) + CV_Assert(u->handle != 0 && u->urefcount == 0); + if(u->tempUMat()) + { + if( u->hostCopyObsolete() && u->refcount > 0 && u->tempCopiedUMat() ) + { + clEnqueueWriteBuffer((cl_command_queue)Queue::getDefault().ptr(), + (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->origdata, 0, 0, 0); + } + u->markHostCopyObsolete(false); + clReleaseMemObject((cl_mem)u->handle); + u->currAllocator = u->prevAllocator; + if(u->refcount == 0) + u->currAllocator->deallocate(u); + } + else + { + if(u->data && u->copyOnMap()) + fastFree(u->data); + clReleaseMemObject((cl_mem)u->handle); + delete u; + } + } + + void map(UMatData* u, int accessFlags) const + { + if(!u) + return; + + CV_Assert( u->handle != 0 ); + + UMatDataAutoLock autolock(u); + + if(accessFlags & ACCESS_WRITE) + u->markDeviceCopyObsolete(true); + + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + + if( u->refcount == 0 ) + { + if( !u->copyOnMap() ) + { + CV_Assert(u->data == 0); + // because there can be other map requests for the same UMat with different access flags, + // we use the universal (read-write) access mode. + cl_int retval = 0; + u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE, + (CL_MAP_READ | CL_MAP_WRITE), + 0, u->size, 0, 0, 0, &retval); + if(u->data && retval >= 0) + { + u->markHostCopyObsolete(false); + return; + } + + // if map failed, switch to copy-on-map mode for the particular buffer + u->flags |= UMatData::COPY_ON_MAP; + } + + if(!u->data) + { + u->data = (uchar*)fastMalloc(u->size); + u->markHostCopyObsolete(true); + } + } + + if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() ) + { + CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->data, 0, 0, 0) >= 0 ); + u->markHostCopyObsolete(false); + } + } + + void unmap(UMatData* u) const + { + if(!u) + return; + + CV_Assert(u->handle != 0); + + UMatDataAutoLock autolock(u); + + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + if( !u->copyOnMap() && u->data ) + { + CV_Assert( clEnqueueUnmapMemObject(q, (cl_mem)u->handle, u->data, 0, 0, 0) >= 0 ); + u->data = 0; + } + else if( u->copyOnMap() && u->deviceCopyObsolete() ) + { + CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, + u->size, u->data, 0, 0, 0) >= 0 ); + } + u->markDeviceCopyObsolete(false); + u->markHostCopyObsolete(false); + } + + bool checkContinuous(int dims, const size_t sz[], + const size_t srcofs[], const size_t srcstep[], + const size_t dstofs[], const size_t dststep[], + size_t& total, size_t new_sz[], + size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[], + size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const + { + bool iscontinuous = true; + srcrawofs = srcofs ? srcofs[dims-1] : 0; + dstrawofs = dstofs ? dstofs[dims-1] : 0; + total = sz[dims-1]; + for( int i = dims-2; i >= 0; i-- ) + { + if( i > 0 && (total != srcstep[i] || total != dststep[i]) ) + iscontinuous = false; + total *= sz[i]; + if( srcofs ) + srcrawofs += srcofs[i]*srcstep[i]; + if( dstofs ) + dstrawofs += dstofs[i]*dststep[i]; + } + + if( !iscontinuous ) + { + // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order. + if( dims == 2 ) + { + new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1; + // we assume that new_... 
arrays are initialized by caller + // with 0's, so there is no else branch + if( srcofs ) + { + new_srcofs[0] = srcofs[1]; + new_srcofs[1] = srcofs[0]; + new_srcofs[2] = 0; + } + + if( dstofs ) + { + new_dstofs[0] = dstofs[1]; + new_dstofs[1] = dstofs[0]; + new_dstofs[2] = 0; + } + + new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0; + new_dststep[0] = dststep[0]; new_dststep[1] = 0; + } + else + { + // we could check for dims == 3 here, + // but from user perspective this one is more informative + CV_Assert(dims <= 3); + new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0]; + if( srcofs ) + { + new_srcofs[0] = srcofs[2]; + new_srcofs[1] = srcofs[1]; + new_srcofs[2] = srcofs[0]; + } + + if( dstofs ) + { + new_dstofs[0] = dstofs[2]; + new_dstofs[1] = dstofs[1]; + new_dstofs[2] = dstofs[0]; + } + + new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0]; + new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0]; + } + } + return iscontinuous; + } + + void download(UMatData* u, void* dstptr, int dims, const size_t sz[], + const size_t srcofs[], const size_t srcstep[], + const size_t dststep[]) const + { + if(!u) + return; + UMatDataAutoLock autolock(u); + + if( u->data && !u->hostCopyObsolete() ) + { + Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep); + return; + } + CV_Assert( u->handle != 0 ); + + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + + size_t total = 0, new_sz[] = {0, 0, 0}; + size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0}; + size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0}; + + bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep, + total, new_sz, + srcrawofs, new_srcofs, new_srcstep, + dstrawofs, new_dstofs, new_dststep); + if( iscontinuous ) + { + CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, + srcrawofs, total, dstptr, 0, 0, 0) >= 0 ); + } + else + { + CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE, + new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1], + new_dststep[0], new_dststep[1], dstptr, 0, 0, 0) >= 0 ); + } + clFinish(q); + } + + void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], + const size_t dstofs[], const size_t dststep[], + const size_t srcstep[]) const + { + if(!u) + return; + + // there should be no user-visible CPU copies of the UMat which we are going to copy to + CV_Assert(u->refcount == 0); + + size_t total = 0, new_sz[] = {0, 0, 0}; + size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0}; + size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0}; + + bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep, + total, new_sz, + srcrawofs, new_srcofs, new_srcstep, + dstrawofs, new_dstofs, new_dststep); + + UMatDataAutoLock autolock(u); + + // if there is cached CPU copy of the GPU matrix, + // we could use it as a destination. + // we can do it in 2 cases: + // 1. we overwrite the whole content + // 2. 
we overwrite part of the matrix, but the GPU copy is out-of-date + if( u->data && (u->hostCopyObsolete() <= u->deviceCopyObsolete() || total == u->size)) + { + Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep); + u->markHostCopyObsolete(false); + u->markDeviceCopyObsolete(true); + return; + } + + CV_Assert( u->handle != 0 ); + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + + if( iscontinuous ) + { + CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, + CL_TRUE, dstrawofs, total, srcptr, 0, 0, 0) >= 0 ); + } + else + { + CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE, + new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1], + new_srcstep[0], new_srcstep[1], srcptr, 0, 0, 0) >= 0 ); + } + + u->markHostCopyObsolete(true); + u->markDeviceCopyObsolete(false); + + clFinish(q); + } + + void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[], + const size_t srcofs[], const size_t srcstep[], + const size_t dstofs[], const size_t dststep[], bool sync) const + { + if(!src || !dst) + return; + + size_t total = 0, new_sz[] = {0, 0, 0}; + size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0}; + size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0}; + + bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep, + total, new_sz, + srcrawofs, new_srcofs, new_srcstep, + dstrawofs, new_dstofs, new_dststep); + + UMatDataAutoLock src_autolock(src); + UMatDataAutoLock dst_autolock(dst); + + if( !src->handle || (src->data && src->hostCopyObsolete() <= src->deviceCopyObsolete()) ) + { + upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep); + return; + } + if( !dst->handle || (dst->data && dst->hostCopyObsolete() <= dst->deviceCopyObsolete()) ) + { + download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep); + dst->markHostCopyObsolete(false); + dst->markDeviceCopyObsolete(true); + return; + } + + // there should be no user-visible CPU copies of the UMat which we are going to copy to + CV_Assert(dst->refcount == 0); + cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); + + if( iscontinuous ) + { + CV_Assert( clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle, + srcrawofs, dstrawofs, total, 0, 0, 0) >= 0 ); + } + else + { + CV_Assert( clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle, + new_srcofs, new_dstofs, new_sz, + new_srcstep[0], new_srcstep[1], new_dststep[0], new_dststep[1], + 0, 0, 0) >= 0 ); + } + + dst->markHostCopyObsolete(true); + dst->markDeviceCopyObsolete(false); + + if( sync ) + clFinish(q); + } +}; + +MatAllocator* getOpenCLAllocator() +{ + static OpenCLAllocator allocator; + return &allocator; +} + +}} + diff --git a/modules/core/src/umatrix.cpp b/modules/core/src/umatrix.cpp new file mode 100644 index 0000000..4ed0e51 --- /dev/null +++ b/modules/core/src/umatrix.cpp @@ -0,0 +1,643 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 
+// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#include "precomp.hpp" + +///////////////////////////////// UMat implementation /////////////////////////////// + +namespace cv { + +// it should be a prime number for the best hash function +enum { UMAT_NLOCKS = 31 }; +static Mutex umatLocks[UMAT_NLOCKS]; + +UMatData::UMatData(const MatAllocator* allocator) +{ + prevAllocator = currAllocator = allocator; + urefcount = refcount = 0; + data = origdata = 0; + size = 0; + flags = 0; + handle = 0; +} + +void UMatData::lock() +{ + umatLocks[(size_t)(void*)this % UMAT_NLOCKS].lock(); +} + +void UMatData::unlock() +{ + umatLocks[(size_t)(void*)this % UMAT_NLOCKS].unlock(); +} + + +MatAllocator* UMat::getStdAllocator() +{ + return ocl::getOpenCLAllocator(); +} + +void swap( UMat& a, UMat& b ) +{ + std::swap(a.flags, b.flags); + std::swap(a.dims, b.dims); + std::swap(a.rows, b.rows); + std::swap(a.cols, b.cols); + std::swap(a.allocator, b.allocator); + std::swap(a.u, b.u); + std::swap(a.offset, b.offset); + + std::swap(a.size.p, b.size.p); + std::swap(a.step.p, b.step.p); + std::swap(a.step.buf[0], b.step.buf[0]); + std::swap(a.step.buf[1], b.step.buf[1]); + + if( a.step.p == b.step.buf ) + { + a.step.p = a.step.buf; + a.size.p = &a.rows; + } + + if( b.step.p == a.step.buf ) + { + b.step.p = b.step.buf; + b.size.p = &b.rows; + } +} + + +static inline void setSize( UMat& m, int _dims, const int* _sz, + const size_t* _steps, bool autoSteps=false ) +{ + CV_Assert( 0 <= _dims && _dims <= CV_MAX_DIM ); + if( m.dims != _dims ) + { + if( m.step.p != m.step.buf ) + { + fastFree(m.step.p); + m.step.p = m.step.buf; + m.size.p = &m.rows; + } + if( _dims > 2 ) + { + m.step.p = (size_t*)fastMalloc(_dims*sizeof(m.step.p[0]) + (_dims+1)*sizeof(m.size.p[0])); + m.size.p = (int*)(m.step.p + _dims) + 1; + m.size.p[-1] = _dims; + m.rows = m.cols = -1; + } + } + + m.dims = _dims; + if( !_sz ) + return; + + size_t esz = CV_ELEM_SIZE(m.flags), total = esz; + int i; + for( i = _dims-1; i >= 0; i-- ) + { + int s = _sz[i]; + CV_Assert( s 
>= 0 );
+        m.size.p[i] = s;
+
+        if( _steps )
+            m.step.p[i] = i < _dims-1 ? _steps[i] : esz;
+        else if( autoSteps )
+        {
+            m.step.p[i] = total;
+            int64 total1 = (int64)total*s;
+            if( (uint64)total1 != (size_t)total1 )
+                CV_Error( CV_StsOutOfRange, "The total matrix size does not fit to \"size_t\" type" );
+            total = (size_t)total1;
+        }
+    }
+
+    if( _dims == 1 )
+    {
+        m.dims = 2;
+        m.cols = 1;
+        m.step[1] = esz;
+    }
+}
+
+static void updateContinuityFlag(UMat& m)
+{
+    int i, j;
+    for( i = 0; i < m.dims; i++ )
+    {
+        if( m.size[i] > 1 )
+            break;
+    }
+
+    for( j = m.dims-1; j > i; j-- )
+    {
+        if( m.step[j]*m.size[j] < m.step[j-1] )
+            break;
+    }
+
+    uint64 t = (uint64)m.step[0]*m.size[0];
+    if( j <= i && t == (size_t)t )
+        m.flags |= UMat::CONTINUOUS_FLAG;
+    else
+        m.flags &= ~UMat::CONTINUOUS_FLAG;
+}
+
+
+static void finalizeHdr(UMat& m)
+{
+    updateContinuityFlag(m);
+    int d = m.dims;
+    if( d > 2 )
+        m.rows = m.cols = -1;
+}
+
+
+UMat Mat::getUMat(int accessFlags) const
+{
+    UMat hdr;
+    if(!u)
+        return hdr;
+    UMat::getStdAllocator()->allocate(u, accessFlags);
+    hdr.flags = flags;
+    setSize(hdr, dims, size.p, step.p);
+    finalizeHdr(hdr);
+    hdr.u = u;
+    hdr.offset = data - datastart;
+    return hdr;
+}
+
+void UMat::create(int d, const int* _sizes, int _type)
+{
+    int i;
+    CV_Assert(0 <= d && d <= CV_MAX_DIM && _sizes);
+    _type = CV_MAT_TYPE(_type);
+
+    if( u && (d == dims || (d == 1 && dims <= 2)) && _type == type() )
+    {
+        if( d == 2 && rows == _sizes[0] && cols == _sizes[1] )
+            return;
+        for( i = 0; i < d; i++ )
+            if( size[i] != _sizes[i] )
+                break;
+        if( i == d && (d > 1 || size[1] == 1))
+            return;
+    }
+
+    release();
+    if( d == 0 )
+        return;
+    flags = (_type & CV_MAT_TYPE_MASK) | MAGIC_VAL;
+    setSize(*this, d, _sizes, 0, true);
+    offset = 0;
+
+    if( total() > 0 )
+    {
+        MatAllocator *a = allocator, *a0 = getStdAllocator();
+        if(!a)
+            a = a0;
+        try
+        {
+            u = a->allocate(dims, size, _type, step.p);
+            CV_Assert(u != 0);
+        }
+        catch(...)
+        {
+            if(a != a0)
+                u = a0->allocate(dims, size, _type, step.p);
+            CV_Assert(u != 0);
+        }
+        CV_Assert( step[dims-1] == (size_t)CV_ELEM_SIZE(flags) );
+    }
+
+    finalizeHdr(*this);
+}
+
+void UMat::copySize(const UMat& m)
+{
+    setSize(*this, m.dims, 0, 0);
+    for( int i = 0; i < dims; i++ )
+    {
+        size[i] = m.size[i];
+        step[i] = m.step[i];
+    }
+}
+
+void UMat::deallocate()
+{
+    u->currAllocator->deallocate(u);
+}
+
+
+UMat::UMat(const UMat& m, const Range& _rowRange, const Range& _colRange)
+    : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows)
+{
+    CV_Assert( m.dims >= 2 );
+    if( m.dims > 2 )
+    {
+        AutoBuffer<Range> rs(m.dims);
+        rs[0] = _rowRange;
+        rs[1] = _colRange;
+        for( int i = 2; i < m.dims; i++ )
+            rs[i] = Range::all();
+        *this = m(rs);
+        return;
+    }
+
+    *this = m;
+    if( _rowRange != Range::all() && _rowRange != Range(0,rows) )
+    {
+        CV_Assert( 0 <= _rowRange.start && _rowRange.start <= _rowRange.end && _rowRange.end <= m.rows );
+        rows = _rowRange.size();
+        offset += step*_rowRange.start;
+        flags |= SUBMATRIX_FLAG;
+    }
+
+    if( _colRange != Range::all() && _colRange != Range(0,cols) )
+    {
+        CV_Assert( 0 <= _colRange.start && _colRange.start <= _colRange.end && _colRange.end <= m.cols );
+        cols = _colRange.size();
+        offset += _colRange.start*elemSize();
+        flags &= cols < m.cols ?
~CONTINUOUS_FLAG : -1; + flags |= SUBMATRIX_FLAG; + } + + if( rows == 1 ) + flags |= CONTINUOUS_FLAG; + + if( rows <= 0 || cols <= 0 ) + { + release(); + rows = cols = 0; + } +} + + +UMat::UMat(const UMat& m, const Rect& roi) + : flags(m.flags), dims(2), rows(roi.height), cols(roi.width), + allocator(m.allocator), u(m.u), offset(m.offset + roi.y*m.step[0]), size(&rows) +{ + CV_Assert( m.dims <= 2 ); + flags &= roi.width < m.cols ? ~CONTINUOUS_FLAG : -1; + flags |= roi.height == 1 ? CONTINUOUS_FLAG : 0; + + size_t esz = CV_ELEM_SIZE(flags); + offset += roi.x*esz; + CV_Assert( 0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && + 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows ); + if( u ) + CV_XADD(&(u->urefcount), 1); + if( roi.width < m.cols || roi.height < m.rows ) + flags |= SUBMATRIX_FLAG; + + step[0] = m.step[0]; step[1] = esz; + + if( rows <= 0 || cols <= 0 ) + { + release(); + rows = cols = 0; + } +} + + +UMat::UMat(const UMat& m, const Range* ranges) + : flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), u(0), offset(0), size(&rows) +{ + int i, d = m.dims; + + CV_Assert(ranges); + for( i = 0; i < d; i++ ) + { + Range r = ranges[i]; + CV_Assert( r == Range::all() || (0 <= r.start && r.start < r.end && r.end <= m.size[i]) ); + } + *this = m; + for( i = 0; i < d; i++ ) + { + Range r = ranges[i]; + if( r != Range::all() && r != Range(0, size.p[i])) + { + size.p[i] = r.end - r.start; + offset += r.start*step.p[i]; + flags |= SUBMATRIX_FLAG; + } + } + updateContinuityFlag(*this); +} + +UMat UMat::diag(int d) const +{ + CV_Assert( dims <= 2 ); + UMat m = *this; + size_t esz = elemSize(); + int len; + + if( d >= 0 ) + { + len = std::min(cols - d, rows); + m.offset += esz*d; + } + else + { + len = std::min(rows + d, cols); + m.offset -= step[0]*d; + } + CV_DbgAssert( len > 0 ); + + m.size[0] = m.rows = len; + m.size[1] = m.cols = 1; + m.step[0] += (len > 1 ? 
esz : 0); + + if( m.rows > 1 ) + m.flags &= ~CONTINUOUS_FLAG; + else + m.flags |= CONTINUOUS_FLAG; + + if( size() != Size(1,1) ) + m.flags |= SUBMATRIX_FLAG; + + return m; +} + +void UMat::locateROI( Size& wholeSize, Point& ofs ) const +{ + CV_Assert( dims <= 2 && step[0] > 0 ); + size_t esz = elemSize(), minstep; + ptrdiff_t delta1 = (ptrdiff_t)offset, delta2 = (ptrdiff_t)u->size; + + if( delta1 == 0 ) + ofs.x = ofs.y = 0; + else + { + ofs.y = (int)(delta1/step[0]); + ofs.x = (int)((delta1 - step[0]*ofs.y)/esz); + CV_DbgAssert( offset == (size_t)(ofs.y*step[0] + ofs.x*esz) ); + } + minstep = (ofs.x + cols)*esz; + wholeSize.height = (int)((delta2 - minstep)/step[0] + 1); + wholeSize.height = std::max(wholeSize.height, ofs.y + rows); + wholeSize.width = (int)((delta2 - step*(wholeSize.height-1))/esz); + wholeSize.width = std::max(wholeSize.width, ofs.x + cols); +} + + +UMat& UMat::adjustROI( int dtop, int dbottom, int dleft, int dright ) +{ + CV_Assert( dims <= 2 && step[0] > 0 ); + Size wholeSize; Point ofs; + size_t esz = elemSize(); + locateROI( wholeSize, ofs ); + int row1 = std::max(ofs.y - dtop, 0), row2 = std::min(ofs.y + rows + dbottom, wholeSize.height); + int col1 = std::max(ofs.x - dleft, 0), col2 = std::min(ofs.x + cols + dright, wholeSize.width); + offset += (row1 - ofs.y)*step + (col1 - ofs.x)*esz; + rows = row2 - row1; cols = col2 - col1; + size.p[0] = rows; size.p[1] = cols; + if( esz*cols == step[0] || rows == 1 ) + flags |= CONTINUOUS_FLAG; + else + flags &= ~CONTINUOUS_FLAG; + return *this; +} + + +UMat UMat::reshape(int new_cn, int new_rows) const +{ + int cn = channels(); + UMat hdr = *this; + + if( dims > 2 && new_rows == 0 && new_cn != 0 && size[dims-1]*cn % new_cn == 0 ) + { + hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn-1) << CV_CN_SHIFT); + hdr.step[dims-1] = CV_ELEM_SIZE(hdr.flags); + hdr.size[dims-1] = hdr.size[dims-1]*cn / new_cn; + return hdr; + } + + CV_Assert( dims <= 2 ); + + if( new_cn == 0 ) + new_cn = cn; + + int total_width = cols * cn; + + if( (new_cn > total_width || total_width % new_cn != 0) && new_rows == 0 ) + new_rows = rows * total_width / new_cn; + + if( new_rows != 0 && new_rows != rows ) + { + int total_size = total_width * rows; + if( !isContinuous() ) + CV_Error( CV_BadStep, + "The matrix is not continuous, thus its number of rows can not be changed" ); + + if( (unsigned)new_rows > (unsigned)total_size ) + CV_Error( CV_StsOutOfRange, "Bad new number of rows" ); + + total_width = total_size / new_rows; + + if( total_width * new_rows != total_size ) + CV_Error( CV_StsBadArg, "The total number of matrix elements " + "is not divisible by the new number of rows" ); + + hdr.rows = new_rows; + hdr.step[0] = total_width * elemSize1(); + } + + int new_width = total_width / new_cn; + + if( new_width * new_cn != total_width ) + CV_Error( CV_BadNumChannels, + "The total width is not divisible by the new number of channels" ); + + hdr.cols = new_width; + hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn-1) << CV_CN_SHIFT); + hdr.step[1] = CV_ELEM_SIZE(hdr.flags); + return hdr; +} + +UMat UMat::diag(const UMat& d) +{ + CV_Assert( d.cols == 1 || d.rows == 1 ); + int len = d.rows + d.cols - 1; + UMat m(len, len, d.type(), Scalar(0)); + UMat md = m.diag(); + if( d.cols == 1 ) + d.copyTo(md); + else + transpose(d, md); + return m; +} + +int UMat::checkVector(int _elemChannels, int _depth, bool _requireContinuous) const +{ + return (depth() == _depth || _depth <= 0) && + (isContinuous() || !_requireContinuous) && + ((dims == 2 && (((rows == 1 || 
cols == 1) && channels() == _elemChannels) || + (cols == _elemChannels && channels() == 1))) || + (dims == 3 && channels() == 1 && size.p[2] == _elemChannels && (size.p[0] == 1 || size.p[1] == 1) && + (isContinuous() || step.p[1] == step.p[2]*size.p[2]))) + ? (int)(total()*channels()/_elemChannels) : -1; +} + + +UMat UMat::cross(InputArray) const +{ + CV_Error(CV_StsNotImplemented, ""); + return UMat(); +} + + +UMat UMat::reshape(int _cn, int _newndims, const int* _newsz) const +{ + if(_newndims == dims) + { + if(_newsz == 0) + return reshape(_cn); + if(_newndims == 2) + return reshape(_cn, _newsz[0]); + } + + CV_Error(CV_StsNotImplemented, ""); + // TBD + return UMat(); +} + + +Mat UMat::getMat(int accessFlags) const +{ + if(!u) + return Mat(); + u->currAllocator->map(u, accessFlags); + CV_Assert(u->data != 0); + Mat hdr(dims, size.p, type(), u->data + offset, step.p); + hdr.refcount = &u->refcount; + hdr.u = u; + hdr.datastart = u->data; + hdr.datalimit = hdr.dataend = u->data + u->size; + CV_XADD(hdr.refcount, 1); + return hdr; +} + +void* UMat::handle(int accessFlags) const +{ + if( !u ) + return 0; + + // check flags: if CPU copy is newer, copy it back to GPU. + if( u->deviceCopyObsolete() ) + { + CV_Assert(u->refcount == 0); + u->currAllocator->unmap(u); + } + else if( u->refcount > 0 && (accessFlags & ACCESS_WRITE) ) + { + CV_Error(Error::StsError, + "it's not allowed to access UMat handle for writing " + "while it's mapped; call Mat::release() first for all its mappings"); + } + return u->handle; +} + +void UMat::ndoffset(size_t* ofs) const +{ + // offset = step[0]*ofs[0] + step[1]*ofs[1] + step[2]*ofs[2] + ...; + size_t t = offset; + for( int i = 0; i < dims; i++ ) + { + size_t s = step.p[i]; + ofs[i] = t / s; + t -= ofs[i]*s; + } +} + +void UMat::copyTo(OutputArray _dst) const +{ + int dtype = _dst.type(); + if( _dst.fixedType() && dtype != type() ) + { + CV_Assert( channels() == CV_MAT_CN(dtype) ); + convertTo( _dst, dtype ); + return; + } + + if( empty() ) + { + _dst.release(); + return; + } + + size_t i, sz[CV_MAX_DIM], srcofs[CV_MAX_DIM], dstofs[CV_MAX_DIM]; + for( i = 0; i < (size_t)dims; i++ ) + sz[i] = size.p[i]; + sz[dims-1] *= elemSize(); + ndoffset(srcofs); + + _dst.create( dims, size, type() ); + if( _dst.kind() == _InputArray::UMAT ) + { + UMat dst = _dst.getUMat(); + void* srchandle = handle(ACCESS_READ); + void* dsthandle = dst.handle(ACCESS_WRITE); + if( srchandle == dsthandle && dst.offset == offset ) + return; + ndoffset(dstofs); + CV_Assert(u->currAllocator == dst.u->currAllocator); + u->currAllocator->copy(u, dst.u, dims, sz, srcofs, step.p, dstofs, dst.step.p, false); + } + else + { + Mat dst = _dst.getMat(); + u->currAllocator->download(u, dst.data, dims, sz, srcofs, step.p, dst.step.p); + } +} + +void UMat::convertTo(OutputArray, int, double, double) const +{ + CV_Error(Error::StsNotImplemented, ""); +} + +UMat& UMat::operator = (const Scalar&) +{ + CV_Error(Error::StsNotImplemented, ""); + return *this; +} + +} + +/* End of file. */
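
Taken together with ocl.cpp above, the intended end-to-end workflow of the new class looks as follows; a minimal sketch (array contents are arbitrary; getUMat()/getMat() pair up with the allocate/map/unmap logic in OpenCLAllocator):

    cv::Mat a(480, 640, CV_8UC1, cv::Scalar(16));
    cv::UMat ua = a.getUMat(cv::ACCESS_READ); // wrap the host buffer for device use
    cv::UMat ub;
    ua.copyTo(ub);                            // device-side copy via OpenCLAllocator::copy
    cv::Mat b = ub.getMat(cv::ACCESS_READ);   // map the device buffer into host memory
    // ... inspect b, then release it before using ub on the device again:
    // UMat::handle() refuses write access while a mapping is alive
    b.release();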