Merge remote-tracking branch 'upstream/3.4' into merge-3.4

[platform/upstream/opencv.git] / modules / core / src / ocl.cpp
diff --git a/modules/core/src/ocl.cpp b/modules/core/src/ocl.cpp

index df5f1e9..53b21e0 100644 (file)
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@@ -113,6 +113,10 @@
  
  #include "opencv2/core/opencl/runtime/opencl_core.hpp"
  
+#ifdef HAVE_DIRECTX
+#include "directx.hpp"
+#endif
+
  #ifdef HAVE_OPENCL_SVM
  #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
  #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
@@ -169,7 +173,7 @@ void traceOpenCLCheck(cl_int status, const char* message)
          CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
          if (check_result != CL_SUCCESS) \
          { \
-            if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
+            static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_CHECK_RESULT must be const char*"); \
              cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
              CV_Error(Error::OpenCLApiCallError, error_msg); \
          } \
@@ -189,7 +193,7 @@ void traceOpenCLCheck(cl_int status, const char* message)
          CV_OCL_TRACE_CHECK_RESULT(check_result, msg); \
          if (check_result != CL_SUCCESS && isRaiseError()) \
          { \
-            if (0) { const char* msg_ = (msg); CV_UNUSED(msg_); /* ensure const char* type (cv::String without c_str()) */ } \
+            static_assert(std::is_convertible<decltype(msg), const char*>::value, "msg of CV_OCL_DBG_CHECK_RESULT must be const char*"); \
              cv::String error_msg = CV_OCL_API_ERROR_MSG(check_result, msg); \
              CV_Error(Error::OpenCLApiCallError, error_msg); \
          } \
@@ -829,6 +833,323 @@ public:
  #endif // OPENCV_HAVE_FILESYSTEM_SUPPORT
  
  
+
+struct OpenCLExecutionContext::Impl
+{
+    ocl::Context context_;
+    int device_;  // device index in context
+    ocl::Queue queue_;
+    int useOpenCL_;
+
+protected:
+    Impl() = delete;
+
+    void _init_device(cl_device_id deviceID)
+    {
+        CV_Assert(deviceID);
+        int ndevices = (int)context_.ndevices();
+        CV_Assert(ndevices > 0);
+        bool found = false;
+        for (int i = 0; i < ndevices; i++)
+        {
+            ocl::Device d = context_.device(i);
+            cl_device_id dhandle = (cl_device_id)d.ptr();
+            if (dhandle == deviceID)
+            {
+                device_ = i;
+                found = true;
+                break;
+            }
+        }
+        CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
+    }
+
+    void _init_device(const ocl::Device& device)
+    {
+        CV_Assert(device.ptr());
+        int ndevices = (int)context_.ndevices();
+        CV_Assert(ndevices > 0);
+        bool found = false;
+        for (int i = 0; i < ndevices; i++)
+        {
+            ocl::Device d = context_.device(i);
+            if (d.getImpl() == device.getImpl())
+            {
+                device_ = i;
+                found = true;
+                break;
+            }
+        }
+        CV_Assert(found && "OpenCL device can't work with passed OpenCL context");
+    }
+
+public:
+    Impl(cl_platform_id platformID, cl_context context, cl_device_id deviceID)
+        : device_(0), useOpenCL_(-1)
+    {
+        CV_UNUSED(platformID);
+        CV_Assert(context);
+        CV_Assert(deviceID);
+
+        context_ = Context::fromHandle(context);
+        _init_device(deviceID);
+        queue_ = Queue(context_, context_.device(device_));
+    }
+
+    Impl(const ocl::Context& context, const ocl::Device& device, const ocl::Queue& queue)
+        : device_(0), useOpenCL_(-1)
+    {
+        CV_Assert(context.ptr());
+        CV_Assert(device.ptr());
+
+        context_ = context;
+        _init_device(device);
+        queue_ = queue;
+    }
+
+    Impl(const ocl::Context& context, const ocl::Device& device)
+        : device_(0), useOpenCL_(-1)
+    {
+        CV_Assert(context.ptr());
+        CV_Assert(device.ptr());
+
+        context_ = context;
+        _init_device(device);
+        queue_ = Queue(context_, context_.device(device_));
+    }
+
+    Impl(const ocl::Context& context, const int device, const ocl::Queue& queue)
+        : context_(context)
+        , device_(device)
+        , queue_(queue)
+        , useOpenCL_(-1)
+    {
+        // nothing
+    }
+    Impl(const Impl& other)
+        : context_(other.context_)
+        , device_(other.device_)
+        , queue_(other.queue_)
+        , useOpenCL_(-1)
+    {
+        // nothing
+    }
+
+    inline bool useOpenCL() const { return const_cast<Impl*>(this)->useOpenCL(); }
+    bool useOpenCL()
+    {
+        if (useOpenCL_ < 0)
+        {
+            try
+            {
+                useOpenCL_ = 0;
+                if (!context_.empty() && context_.ndevices() > 0)
+                {
+                    const Device& d = context_.device(device_);
+                    useOpenCL_ = d.available();
+                }
+            }
+            catch (const cv::Exception&)
+            {
+                // nothing
+            }
+            if (!useOpenCL_)
+                CV_LOG_INFO(NULL, "OpenCL: can't use OpenCL execution context");
+        }
+        return useOpenCL_ > 0;
+    }
+
+    void setUseOpenCL(bool flag)
+    {
+        if (!flag)
+            useOpenCL_ = 0;
+        else
+            useOpenCL_ = -1;
+    }
+
+    static const std::shared_ptr<Impl>& getInitializedExecutionContext()
+    {
+        CV_TRACE_FUNCTION();
+
+        CV_LOG_INFO(NULL, "OpenCL: initializing thread execution context");
+
+        static bool initialized = false;
+        static std::shared_ptr<Impl> g_primaryExecutionContext;
+
+        if (!initialized)
+        {
+            cv::AutoLock lock(getInitializationMutex());
+            if (!initialized)
+            {
+                CV_LOG_INFO(NULL, "OpenCL: creating new execution context...");
+                try
+                {
+                    Context c = ocl::Context::create(std::string());
+                    if (c.ndevices())
+                    {
+                        int deviceId = 0;
+                        auto& d = c.device(deviceId);
+                        if (d.available())
+                        {
+                            auto q = ocl::Queue(c, d);
+                            if (!q.ptr())
+                            {
+                                CV_LOG_ERROR(NULL, "OpenCL: Can't create default OpenCL queue");
+                            }
+                            else
+                            {
+                                g_primaryExecutionContext = std::make_shared<Impl>(c, deviceId, q);
+                                CV_LOG_INFO(NULL, "OpenCL: device=" << d.name());
+                            }
+                        }
+                        else
+                        {
+                            CV_LOG_ERROR(NULL, "OpenCL: OpenCL device is not available (CL_DEVICE_AVAILABLE returns false)");
+                        }
+                    }
+                    else
+                    {
+                        CV_LOG_INFO(NULL, "OpenCL: context is not available/disabled");
+                    }
+                }
+                catch (const std::exception& e)
+                {
+                    CV_LOG_INFO(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: " << e.what());
+                }
+                catch (...)
+                {
+                    CV_LOG_WARNING(NULL, "OpenCL: Can't initialize OpenCL context/device/queue: unknown C++ exception");
+                }
+                initialized = true;
+            }
+        }
+        return g_primaryExecutionContext;
+    }
+};
+
+Context& OpenCLExecutionContext::getContext() const
+{
+    CV_Assert(p);
+    return p->context_;
+}
+Device& OpenCLExecutionContext::getDevice() const
+{
+    CV_Assert(p);
+    return p->context_.device(p->device_);
+}
+Queue& OpenCLExecutionContext::getQueue() const
+{
+    CV_Assert(p);
+    return p->queue_;
+}
+
+bool OpenCLExecutionContext::useOpenCL() const
+{
+    if (p)
+        return p->useOpenCL();
+    return false;
+}
+void OpenCLExecutionContext::setUseOpenCL(bool flag)
+{
+    CV_Assert(p);
+    p->setUseOpenCL(flag);
+}
+
+/* static */
+OpenCLExecutionContext& OpenCLExecutionContext::getCurrent()
+{
+    CV_TRACE_FUNCTION();
+    CoreTLSData& data = getCoreTlsData();
+    OpenCLExecutionContext& c = data.oclExecutionContext;
+    if (!data.oclExecutionContextInitialized)
+    {
+        data.oclExecutionContextInitialized = true;
+        if (c.empty() && haveOpenCL())
+            c.p = Impl::getInitializedExecutionContext();
+    }
+    return c;
+}
+
+/* static */
+OpenCLExecutionContext& OpenCLExecutionContext::getCurrentRef()
+{
+    CV_TRACE_FUNCTION();
+    CoreTLSData& data = getCoreTlsData();
+    OpenCLExecutionContext& c = data.oclExecutionContext;
+    return c;
+}
+
+void OpenCLExecutionContext::bind() const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(p);
+    CoreTLSData& data = getCoreTlsData();
+    data.oclExecutionContext = *this;
+    data.oclExecutionContextInitialized = true;
+    data.useOpenCL = p->useOpenCL_;  // propagate "-1", avoid call useOpenCL()
+}
+
+
+OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue() const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(p);
+    const Queue q(getContext(), getDevice());
+    return cloneWithNewQueue(q);
+}
+
+OpenCLExecutionContext OpenCLExecutionContext::cloneWithNewQueue(const ocl::Queue& q) const
+{
+    CV_TRACE_FUNCTION();
+    CV_Assert(p);
+    CV_Assert(q.ptr() != NULL);
+    OpenCLExecutionContext c;
+    c.p = std::make_shared<Impl>(p->context_, p->device_, q);
+    return c;
+}
+
+/* static */
+OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device, const ocl::Queue& queue)
+{
+    CV_TRACE_FUNCTION();
+    if (!haveOpenCL())
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
+
+    CV_Assert(!context.empty());
+    CV_Assert(context.ptr());
+    CV_Assert(!device.empty());
+    CV_Assert(device.ptr());
+    OpenCLExecutionContext ctx;
+    ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device, queue);
+    return ctx;
+
+}
+
+/* static */
+OpenCLExecutionContext OpenCLExecutionContext::create(const Context& context, const Device& device)
+{
+    CV_TRACE_FUNCTION();
+    if (!haveOpenCL())
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
+
+    CV_Assert(!context.empty());
+    CV_Assert(context.ptr());
+    CV_Assert(!device.empty());
+    CV_Assert(device.ptr());
+    OpenCLExecutionContext ctx;
+    ctx.p = std::make_shared<OpenCLExecutionContext::Impl>(context, device);
+    return ctx;
+
+}
+
+void OpenCLExecutionContext::release()
+{
+    CV_TRACE_FUNCTION();
+    p.reset();
+}
+
+
+
  // true if we have initialized OpenCL subsystem with available platforms
  static bool g_isOpenCLInitialized = false;
  static bool g_isOpenCLAvailable = false;
@@ -847,14 +1168,18 @@ bool haveOpenCL()
              {
                  g_isOpenCLAvailable = false;
                  g_isOpenCLInitialized = true;
+                return false;
              }
          }
+
+        cv::AutoLock lock(getInitializationMutex());
          CV_LOG_INFO(NULL, "Initialize OpenCL runtime...");
          try
          {
              cl_uint n = 0;
              g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
              g_isOpenCLAvailable &= n > 0;
+            CV_LOG_INFO(NULL, "OpenCL: found " << n << " platforms");
          }
          catch (...)
          {
@@ -872,11 +1197,16 @@ bool useOpenCL()
      {
          try
          {
-            data.useOpenCL = (int)(haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available()) ? 1 : 0;
+            data.useOpenCL = 0;
+            if (haveOpenCL())
+            {
+                auto c = OpenCLExecutionContext::getCurrent();
+                data.useOpenCL = c.useOpenCL();
+            }
          }
          catch (...)
          {
-            data.useOpenCL = 0;
+            CV_LOG_INFO(NULL, "OpenCL: can't initialize thread OpenCL execution context");
          }
      }
      return data.useOpenCL > 0;
@@ -894,16 +1224,23 @@ void setUseOpenCL(bool flag)
      CV_TRACE_FUNCTION();
  
      CoreTLSData& data = getCoreTlsData();
-    if (!flag)
+    auto& c = OpenCLExecutionContext::getCurrentRef();
+    if (!c.empty())
      {
-        data.useOpenCL = 0;
+        c.setUseOpenCL(flag);
+        data.useOpenCL = c.useOpenCL();
      }
-    else if( haveOpenCL() )
+    else
      {
-        data.useOpenCL = (Device::getDefault().ptr() != NULL) ? 1 : 0;
+        if (!flag)
+            data.useOpenCL = 0;
+        else
+            data.useOpenCL = -1; // enabled by default (if context is not initialized)
      }
  }
  
+
+
  #ifdef HAVE_CLAMDBLAS
  
  class AmdBlasHelper
@@ -1150,6 +1487,7 @@ void* Platform::ptr() const
  
  Platform& Platform::getDefault()
  {
+    CV_LOG_ONCE_WARNING(NULL, "OpenCL: Platform::getDefault() is deprecated and will be removed. Use cv::ocl::getPlatfomsInfo() for enumeration of available platforms");
      static Platform p;
      if( !p.p )
      {
@@ -1188,9 +1526,24 @@ static void parseOpenCLVersion(const String &version, int &major, int &minor)
  struct Device::Impl
  {
      Impl(void* d)
+        : refcount(1)
+        , handle(0)
+    {
+        try
+        {
+            cl_device_id device = (cl_device_id)d;
+            _init(device);
+            CV_OCL_CHECK(clRetainDevice(device));  // increment reference counter on success only
+        }
+        catch (...)
+        {
+            throw;
+        }
+    }
+
+    void _init(cl_device_id d)
      {
          handle = (cl_device_id)d;
-        refcount = 1;
  
          name_ = getStrProp(CL_DEVICE_NAME);
          version_ = getStrProp(CL_DEVICE_VERSION);
@@ -1253,6 +1606,20 @@ struct Device::Impl
  #endif
      }
  
+    ~Impl()
+    {
+#ifdef _WIN32
+        if (!cv::__termination)
+#endif
+        {
+            if (handle)
+            {
+                CV_OCL_CHECK(clReleaseDevice(handle));
+                handle = 0;
+            }
+        }
+    }
+
      template<typename _TpCL, typename _TpOut>
      _TpOut getProp(cl_device_info prop) const
      {
@@ -1350,6 +1717,16 @@ void Device::set(void* d)
      if(p)
          p->release();
      p = new Impl(d);
+    if (p->handle)
+    {
+        CV_OCL_CHECK(clReleaseDevice((cl_device_id)d));
+    }
+}
+
+Device Device::fromHandle(void* d)
+{
+    Device device(d);
+    return device;
  }
  
  void* Device::ptr() const
@@ -1612,10 +1989,14 @@ size_t Device::profilingTimerResolution() const
  
  const Device& Device::getDefault()
  {
-    const Context& ctx = Context::getDefault();
-    int idx = getCoreTlsData().device;
-    const Device& device = ctx.device(idx);
-    return device;
+    auto& c = OpenCLExecutionContext::getCurrent();
+    if (!c.empty())
+    {
+        return c.getDevice();
+    }
+
+    static Device dummy;
+    return dummy;
  }
  
  ////////////////////////////////////// Context ///////////////////////////////////////////////////
@@ -1667,7 +2048,7 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
      split(configurationStr, ':', parts);
      if (parts.size() > 3)
      {
-        std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
+        CV_LOG_ERROR(NULL, "OpenCL: Invalid configuration string for OpenCL device: " << configurationStr);
          return false;
      }
      if (parts.size() > 2)
@@ -1684,22 +2065,20 @@ static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
  }
  
  #if defined WINRT || defined _WIN32_WCE
-static cl_device_id selectOpenCLDevice()
+static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
  {
+    CV_UNUSED(configuration)
      return NULL;
  }
  #else
-// std::tolower is int->int
-static char char_tolower(char ch)
-{
-    return (char)std::tolower((int)ch);
-}
-static cl_device_id selectOpenCLDevice()
+static cl_device_id selectOpenCLDevice(const char* configuration = NULL)
  {
      std::string platform, deviceName;
      std::vector<std::string> deviceTypes;
  
-    const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
+    if (!configuration)
+        configuration = getenv("OPENCV_OPENCL_DEVICE");
+
      if (configuration &&
              (strcmp(configuration, "disabled") == 0 ||
               !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
@@ -1759,7 +2138,7 @@ static cl_device_id selectOpenCLDevice()
          }
          if (selectedPlatform == -1)
          {
-            std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
+            CV_LOG_ERROR(NULL, "OpenCL: Can't find OpenCL platform by name: " << platform);
              goto not_found;
          }
      }
@@ -1778,7 +2157,7 @@ static cl_device_id selectOpenCLDevice()
      {
          int deviceType = 0;
          std::string tempStrDeviceType = deviceTypes[t];
-        std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), char_tolower);
+        std::transform(tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), details::char_tolower);
  
          if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
              deviceType = Device::TYPE_GPU;
@@ -1790,7 +2169,7 @@ static cl_device_id selectOpenCLDevice()
              deviceType = Device::TYPE_ALL;
          else
          {
-            std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
+            CV_LOG_ERROR(NULL, "OpenCL: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t]);
              goto not_found;
          }
  
@@ -1841,13 +2220,16 @@ not_found:
      if (!configuration)
          return NULL; // suppress messages on stderr
  
-    std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << configuration << std::endl
-            << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
-            << "    Device types: ";
+    std::ostringstream msg;
+    msg << "ERROR: Requested OpenCL device not found, check configuration: '" << configuration << "'" << std::endl
+        << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
+        << "    Device types:";
      for (size_t t = 0; t < deviceTypes.size(); t++)
-        std::cerr << deviceTypes[t] << " ";
+        msg << ' ' << deviceTypes[t];
+
+    msg << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName);
  
-    std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
+    CV_LOG_ERROR(NULL, msg.str());
      return NULL;
  }
  #endif
@@ -1928,131 +2310,260 @@ static size_t getProgramCountLimit()
      return count;
  }
  
+static int g_contextId = 0;
+
+class OpenCLBufferPoolImpl;
+class OpenCLSVMBufferPoolImpl;
+
  struct Context::Impl
  {
      static Context::Impl* get(Context& context) { return context.p; }
  
-    void __init()
+    typedef std::deque<Context::Impl*> container_t;
+    static container_t& getGlobalContainer()
      {
-        refcount = 1;
-        handle = 0;
+        // never delete this container (Impl lifetime is greater due to TLS storage)
+        static container_t* g_contexts = new container_t();
+        return *g_contexts;
+    }
+
+protected:
+    Impl(const std::string& configuration_)
+        : refcount(1)
+        , contextId(CV_XADD(&g_contextId, 1))
+        , configuration(configuration_)
+        , handle(0)
+#ifdef HAVE_DIRECTX
+        , p_directx_impl(0)
+#endif
  #ifdef HAVE_OPENCL_SVM
-        svmInitialized = false;
+        , svmInitialized(false)
  #endif
+    {
+        if (!haveOpenCL())
+            CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
+
+        cv::AutoLock lock(cv::getInitializationMutex());
+        auto& container = getGlobalContainer();
+        container.resize(std::max(container.size(), (size_t)contextId + 1));
+        container[contextId] = this;
      }
  
-    Impl()
+    ~Impl()
      {
-        __init();
+#ifdef _WIN32
+        if (!cv::__termination)
+#endif
+        {
+            if (handle)
+            {
+                CV_OCL_DBG_CHECK(clReleaseContext(handle));
+                handle = NULL;
+            }
+            devices.clear();
+#ifdef HAVE_DIRECTX
+            directx::internal::deleteDirectXImpl(&p_directx_impl);
+#endif
+        }
+
+        {
+            cv::AutoLock lock(cv::getInitializationMutex());
+            auto& container = getGlobalContainer();
+            CV_CheckLT((size_t)contextId, container.size(), "");
+            container[contextId] = NULL;
+        }
      }
  
-    void setDefault()
+    void init_device_list()
      {
-        CV_Assert(handle == NULL);
+        CV_Assert(handle);
  
-        cl_device_id d = selectOpenCLDevice();
+        cl_uint ndevices = 0;
+        CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_NUM_DEVICES, sizeof(ndevices), &ndevices, NULL));
+        CV_Assert(ndevices > 0);
  
-        if (d == NULL)
-            return;
+        cv::AutoBuffer<cl_device_id> cl_devices(ndevices);
+        size_t devices_ret_size = 0;
+        CV_OCL_CHECK(clGetContextInfo(handle, CL_CONTEXT_DEVICES, cl_devices.size() * sizeof(cl_device_id), &cl_devices[0], &devices_ret_size));
+        CV_CheckEQ(devices_ret_size, cl_devices.size() * sizeof(cl_device_id), "");
  
-        cl_platform_id pl = NULL;
-        CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
+        devices.clear();
+        for (unsigned i = 0; i < ndevices; i++)
+        {
+            devices.emplace_back(Device::fromHandle(cl_devices[i]));
+        }
+    }
  
-        cl_context_properties prop[] =
+    void __init_buffer_pools();  // w/o synchronization
+    void _init_buffer_pools() const
+    {
+        if (!bufferPool_)
          {
-            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
-            0
-        };
+            cv::AutoLock lock(cv::getInitializationMutex());
+            if (!bufferPool_)
+            {
+                const_cast<Impl*>(this)->__init_buffer_pools();
+            }
+        }
+    }
+public:
+    static Impl* findContext(const std::string& configuration)
+    {
+        CV_TRACE_FUNCTION();
+        cv::AutoLock lock(cv::getInitializationMutex());
+        auto& container = getGlobalContainer();
+        if (configuration.empty() && !container.empty())
+            return container[0];
+        for (auto it = container.begin(); it != container.end(); ++it)
+        {
+            Impl* i = *it;
+            if (i && i->configuration == configuration)
+            {
+                return i;
+            }
+        }
+        return NULL;
+    }
  
-        // !!! in the current implementation force the number of devices to 1 !!!
-        cl_uint nd = 1;
-        cl_int status;
+    static Impl* findOrCreateContext(const std::string& configuration_)
+    {
+        CV_TRACE_FUNCTION();
+        std::string configuration = configuration_;
+        if (configuration_.empty())
+        {
+            const char* c = getenv("OPENCV_OPENCL_DEVICE");
+            if (c)
+                configuration = c;
+        }
+        Impl* impl = findContext(configuration);
+        if (impl)
+        {
+            CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
+            impl->addref();
+            return impl;
+        }
  
-        handle = clCreateContext(prop, nd, &d, 0, 0, &status);
-        CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
+        cl_device_id d = selectOpenCLDevice(configuration.empty() ? NULL : configuration.c_str());
+        if (d == NULL)
+            return NULL;
  
-        bool ok = handle != 0 && status == CL_SUCCESS;
-        if( ok )
+        impl = new Impl(configuration);
+        try
          {
-            devices.resize(nd);
-            devices[0].set(d);
+            impl->createFromDevice(d);
+            if (impl->handle)
+                return impl;
+            delete impl;
+            return NULL;
+        }
+        catch (...)
+        {
+            delete impl;
+            throw;
          }
-        else
-            handle = NULL;
      }
  
-    Impl(int dtype0)
+    static Impl* findOrCreateContext(cl_context h)
      {
-        __init();
+        CV_TRACE_FUNCTION();
  
-        cl_int retval = 0;
-        cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
-        cl_context_properties prop[] =
+        CV_Assert(h);
+
+        std::string configuration = cv::format("@ctx-%p", (void*)h);
+        Impl* impl = findContext(configuration);
+        if (impl)
          {
-            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
-            0
-        };
+            CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
+            impl->addref();
+            return impl;
+        }
  
-        cl_uint nd0 = 0;
-        int dtype = dtype0 & 15;
-        cl_int status = clGetDeviceIDs(pl, dtype, 0, NULL, &nd0);
-        if (status != CL_DEVICE_NOT_FOUND) // Not an error if platform has no devices
+        impl = new Impl(configuration);
+        try
          {
-            CV_OCL_DBG_CHECK_RESULT(status,
-                cv::format("clGetDeviceIDs(platform=%p, device_type=%d, num_entries=0, devices=NULL, numDevices=%p)", pl, dtype, &nd0).c_str());
+            CV_OCL_CHECK(clRetainContext(h));
+            impl->handle = h;
+            impl->init_device_list();
+            return impl;
          }
+        catch (...)
+        {
+            delete impl;
+            throw;
+        }
+    }
  
-        if (nd0 == 0)
-            return;
+    static Impl* findOrCreateContext(const ocl::Device& device)
+    {
+        CV_TRACE_FUNCTION();
  
-        AutoBuffer<void*> dlistbuf(nd0*2+1);
-        cl_device_id* dlist = (cl_device_id*)dlistbuf.data();
-        cl_device_id* dlist_new = dlist + nd0;
-        CV_OCL_DBG_CHECK(clGetDeviceIDs(pl, dtype, nd0, dlist, &nd0));
+        CV_Assert(!device.empty());
+        cl_device_id d = (cl_device_id)device.ptr();
+        CV_Assert(d);
  
-        cl_uint i, nd = 0;
-        String name0;
-        for(i = 0; i < nd0; i++)
+        std::string configuration = cv::format("@dev-%p", (void*)d);
+        Impl* impl = findContext(configuration);
+        if (impl)
          {
-            Device d(dlist[i]);
-            if( !d.available() || !d.compilerAvailable() )
-                continue;
-            if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
-                continue;
-            if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
-                continue;
-            String name = d.name();
-            if( nd != 0 && name != name0 )
-                continue;
-            name0 = name;
-            dlist_new[nd++] = dlist[i];
+            CV_LOG_INFO(NULL, "OpenCL: reuse context@" << impl->contextId << " for configuration: " << configuration)
+            impl->addref();
+            return impl;
          }
  
-        if(nd == 0)
+        impl = new Impl(configuration);
+        try
+        {
+            impl->createFromDevice(d);
+            CV_Assert(impl->handle);
+            return impl;
+        }
+        catch (...)
+        {
+            delete impl;
+            throw;
+        }
+    }
+
+    void setDefault()
+    {
+        CV_TRACE_FUNCTION();
+        cl_device_id d = selectOpenCLDevice();
+
+        if (d == NULL)
              return;
  
+        createFromDevice(d);
+    }
+
+    void createFromDevice(cl_device_id d)
+    {
+        CV_TRACE_FUNCTION();
+        CV_Assert(handle == NULL);
+
+        cl_platform_id pl = NULL;
+        CV_OCL_DBG_CHECK(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL));
+
+        cl_context_properties prop[] =
+        {
+            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
+            0
+        };
+
          // !!! in the current implementation force the number of devices to 1 !!!
-        nd = 1;
+        cl_uint nd = 1;
+        cl_int status;
  
-        handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
-        CV_OCL_DBG_CHECK_RESULT(retval, "clCreateContext");
-        bool ok = handle != 0 && retval == CL_SUCCESS;
+        handle = clCreateContext(prop, nd, &d, 0, 0, &status);
+        CV_OCL_DBG_CHECK_RESULT(status, "clCreateContext");
+
+        bool ok = handle != 0 && status == CL_SUCCESS;
          if( ok )
          {
              devices.resize(nd);
-            for( i = 0; i < nd; i++ )
-                devices[i].set(dlist_new[i]);
+            devices[0].set(d);
          }
-    }
-
-    ~Impl()
-    {
-        if(handle)
-        {
-            CV_OCL_DBG_CHECK(clReleaseContext(handle));
+        else
              handle = NULL;
-        }
-        devices.clear();
      }
  
      Program getProg(const ProgramSource& src, const String& buildflags, String& errmsg);
@@ -2130,6 +2641,9 @@ struct Context::Impl
  
      IMPLEMENT_REFCOUNTABLE();
  
+    const int contextId;  // global unique ID
+    const std::string configuration;
+
      cl_context handle;
      std::vector<Device> devices;
  
@@ -2142,6 +2656,34 @@ struct Context::Impl
      typedef std::list<cv::String> CacheList;
      CacheList cacheList;
  
+    std::shared_ptr<OpenCLBufferPoolImpl> bufferPool_;
+    std::shared_ptr<OpenCLBufferPoolImpl> bufferPoolHostPtr_;
+    OpenCLBufferPoolImpl& getBufferPool() const
+    {
+        _init_buffer_pools();
+        CV_DbgAssert(bufferPool_);
+        return *bufferPool_.get();
+    }
+    OpenCLBufferPoolImpl& getBufferPoolHostPtr() const
+    {
+        _init_buffer_pools();
+        CV_DbgAssert(bufferPoolHostPtr_);
+        return *bufferPoolHostPtr_.get();
+    }
+
+#ifdef HAVE_DIRECTX
+    directx::internal::OpenCLDirectXImpl* p_directx_impl;
+
+    directx::internal::OpenCLDirectXImpl* getDirectXImpl()
+    {
+        if (!p_directx_impl)
+        {
+            p_directx_impl = directx::internal::createDirectXImpl();
+        }
+        return p_directx_impl;
+    }
+#endif
+
  #ifdef HAVE_OPENCL_SVM
      bool svmInitialized;
      bool svmAvailable;
@@ -2277,6 +2819,15 @@ struct Context::Impl
          svmFunctions.fn_clSVMAlloc = NULL;
          return;
      }
+
+    std::shared_ptr<OpenCLSVMBufferPoolImpl> bufferPoolSVM_;
+
+    OpenCLSVMBufferPoolImpl& getBufferPoolSVM() const
+    {
+        _init_buffer_pools();
+        CV_DbgAssert(bufferPoolSVM_);
+        return *bufferPoolSVM_.get();
+    }
  #endif
  
      friend class Program;
@@ -2288,49 +2839,66 @@ Context::Context()
      p = 0;
  }
  
+Context::~Context()
+{
+    release();
+}
+
+// deprecated
  Context::Context(int dtype)
  {
      p = 0;
      create(dtype);
  }
  
-bool Context::create()
+void Context::release()
  {
-    if( !haveOpenCL() )
-        return false;
-    if(p)
-        p->release();
-    p = new Impl();
-    if(!p->handle)
+    if (p)
      {
-        delete p;
-        p = 0;
+        p->release();
+        p = NULL;
      }
-    return p != 0;
  }
  
-bool Context::create(int dtype0)
+bool Context::create()
  {
-    if( !haveOpenCL() )
+    release();
+    if (!haveOpenCL())
          return false;
-    if(p)
-        p->release();
-    p = new Impl(dtype0);
-    if(!p->handle)
-    {
-        delete p;
-        p = 0;
-    }
-    return p != 0;
+    p = Impl::findOrCreateContext(std::string());
+    if (p && p->handle)
+        return true;
+    release();
+    return false;
  }
  
-Context::~Context()
+// deprecated
+bool Context::create(int dtype)
  {
-    if (p)
+    if( !haveOpenCL() )
+        return false;
+    release();
+    if (dtype == CL_DEVICE_TYPE_DEFAULT || (unsigned)dtype == (unsigned)CL_DEVICE_TYPE_ALL)
      {
-        p->release();
-        p = NULL;
+        p = Impl::findOrCreateContext("");
+    }
+    else if (dtype == CL_DEVICE_TYPE_GPU)
+    {
+        p = Impl::findOrCreateContext(":GPU:");
+    }
+    else if (dtype == CL_DEVICE_TYPE_CPU)
+    {
+        p = Impl::findOrCreateContext(":CPU:");
+    }
+    else
+    {
+        CV_LOG_ERROR(NULL, "OpenCL: Can't recognize OpenCV device type=" << dtype);
+    }
+    if (p && !p->handle)
+    {
+        release();
      }
+    return p != 0;
  }
  
  Context::Context(const Context& c)
@@ -2361,7 +2929,7 @@ size_t Context::ndevices() const
      return p ? p->devices.size() : 0;
  }
  
-const Device& Context::device(size_t idx) const
+Device& Context::device(size_t idx) const
  {
      static Device dummy;
      return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
@@ -2369,23 +2937,16 @@ const Device& Context::device(size_t idx) const
  
  Context& Context::getDefault(bool initialize)
  {
-    static Context* ctx = new Context();
-    if(!ctx->p && haveOpenCL())
+    auto& c = OpenCLExecutionContext::getCurrent();
+    if (!c.empty())
      {
-        if (!ctx->p)
-            ctx->p = new Impl();
-        if (initialize)
-        {
-            // do not create new Context right away.
-            // First, try to retrieve existing context of the same type.
-            // In its turn, Platform::getContext() may call Context::create()
-            // if there is no such context.
-            if (ctx->p->handle == NULL)
-                ctx->p->setDefault();
-        }
+        auto& ctx = c.getContext();
+        return ctx;
      }
  
-    return *ctx;
+    CV_UNUSED(initialize);
+    static Context dummy;
+    return dummy;
  }
  
  Program Context::getProg(const ProgramSource& prog,
@@ -2400,6 +2961,30 @@ void Context::unloadProg(Program& prog)
          p->unloadProg(prog);
  }
  
+/* static */
+Context Context::fromHandle(void* context)
+{
+    Context ctx;
+    ctx.p = Impl::findOrCreateContext((cl_context)context);
+    return ctx;
+}
+
+/* static */
+Context Context::fromDevice(const ocl::Device& device)
+{
+    Context ctx;
+    ctx.p = Impl::findOrCreateContext(device);
+    return ctx;
+}
+
+/* static */
+Context Context::create(const std::string& configuration)
+{
+    Context ctx;
+    ctx.p = Impl::findOrCreateContext(configuration);
+    return ctx;
+}
+
  #ifdef HAVE_OPENCL_SVM
  bool Context::useSVM() const
  {
@@ -2483,12 +3068,23 @@ static void get_platform_name(cl_platform_id id, String& name)
  */
  void attachContext(const String& platformName, void* platformID, void* context, void* deviceID)
  {
-    cl_uint cnt = 0;
+    auto ctx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
+    ctx.bind();
+}
  
+/* static */
+OpenCLExecutionContext OpenCLExecutionContext::create(
+        const std::string& platformName, void* platformID, void* context, void* deviceID
+)
+{
+    if (!haveOpenCL())
+        CV_Error(cv::Error::OpenCLApiCallError, "OpenCL runtime is not available!");
+
+    cl_uint cnt = 0;
      CV_OCL_CHECK(clGetPlatformIDs(0, 0, &cnt));
  
      if (cnt == 0)
-        CV_Error(cv::Error::OpenCLApiCallError, "no OpenCL platform available!");
+        CV_Error(cv::Error::OpenCLApiCallError, "No OpenCL platform available!");
  
      std::vector<cl_platform_id> platforms(cnt);
  
@@ -2518,44 +3114,25 @@ void attachContext(const String& platformName, void* platformID, void* context,
      if (platformName != actualPlatformName)
          CV_Error(cv::Error::OpenCLApiCallError, "No matched platforms available!");
  
-    // do not initialize OpenCL context
-    Context ctx = Context::getDefault(false);
-
-    // attach supplied context to OpenCV
-    initializeContextFromHandle(ctx, platformID, context, deviceID);
-
-    CV_OCL_CHECK(clRetainContext((cl_context)context));
-
-    // clear command queue, if any
-    CoreTLSData& data = getCoreTlsData();
-    data.oclQueue.finish();
-    Queue q;
-    data.oclQueue = q;
-
-    return;
-} // attachContext()
-
+    OpenCLExecutionContext ctx;
+    ctx.p = std::make_shared<OpenCLExecutionContext::Impl>((cl_platform_id)platformID, (cl_context)context, (cl_device_id)deviceID);
+    CV_OCL_CHECK(clReleaseContext((cl_context)context));
+    CV_OCL_CHECK(clReleaseDevice((cl_device_id)deviceID));
+    return ctx;
+}
  
-void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
+void initializeContextFromHandle(Context& ctx, void* _platform, void* _context, void* _device)
  {
+    // internal call, less checks
+    cl_platform_id platformID = (cl_platform_id)_platform;
      cl_context context = (cl_context)_context;
-    cl_device_id device = (cl_device_id)_device;
+    cl_device_id deviceID = (cl_device_id)_device;
  
-    // cleanup old context
-    Context::Impl * impl = ctx.p;
-    if (impl->handle)
-    {
-        CV_OCL_DBG_CHECK(clReleaseContext(impl->handle));
-    }
-    impl->devices.clear();
-
-    impl->handle = context;
-    impl->devices.resize(1);
-    impl->devices[0].set(device);
+    std::string platformName = PlatformInfo(&platformID).name();
  
-    Platform& p = Platform::getDefault();
-    Platform::Impl* pImpl = p.p;
-    pImpl->handle = (cl_platform_id)platform;
+    auto clExecCtx = OpenCLExecutionContext::create(platformName, platformID, context, deviceID);
+    CV_Assert(!clExecCtx.empty());
+    ctx = clExecCtx.getContext();
  }
  
  /////////////////////////////////////////// Queue /////////////////////////////////////////////
@@ -2718,10 +3295,14 @@ void* Queue::ptr() const
  
  Queue& Queue::getDefault()
  {
-    Queue& q = getCoreTlsData().oclQueue;
-    if( !q.p && haveOpenCL() )
-        q.create(Context::getDefault());
-    return q;
+    auto& c = OpenCLExecutionContext::getCurrent();
+    if (!c.empty())
+    {
+        auto& q = c.getQueue();
+        return q;
+    }
+    static Queue dummy;
+    return dummy;
  }
  
  static cl_command_queue getQueue(const Queue& q)
@@ -3012,8 +3593,8 @@ int Kernel::set(int i, const KernelArg& arg)
      cl_int status = 0;
      if( arg.m )
      {
-        int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
-                          ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
+        AccessFlag accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : static_cast<AccessFlag>(0)) |
+                                 ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : static_cast<AccessFlag>(0));
          bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
          if (ptronly && arg.m->empty())
          {
@@ -3099,7 +3680,7 @@ int Kernel::set(int i, const KernelArg& arg)
                  i += 3;
              }
          }
-        p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
+        p->addUMat(*arg.m, !!(accessFlags & ACCESS_WRITE));
          return i;
      }
      status = clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj);
@@ -3187,9 +3768,9 @@ bool Kernel::Impl::run(int dims, size_t globalsize[], size_t localsize[],
      if (retval != CL_SUCCESS)
  #endif
      {
-        cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%dx%dx%d, localsize=%s) sync=%s", name.c_str(), (int)dims,
+        cv::String msg = cv::format("clEnqueueNDRangeKernel('%s', dims=%d, globalsize=%zux%zux%zu, localsize=%s) sync=%s", name.c_str(), (int)dims,
                          globalsize[0], (dims > 1 ? globalsize[1] : 1), (dims > 2 ? globalsize[2] : 1),
-                        (localsize ? cv::format("%dx%dx%d", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
+                        (localsize ? cv::format("%zux%zux%zu", localsize[0], (dims > 1 ? localsize[1] : 1), (dims > 2 ? localsize[2] : 1)) : cv::String("NULL")).c_str(),
                          sync ? "true" : "false"
                          );
          if (retval != CL_SUCCESS)
@@ -3400,7 +3981,7 @@ struct ProgramSource::Impl
          default:
              CV_Error(Error::StsInternal, "Internal error");
          }
-        sourceHash_ = cv::format("%08llx", hash);
+        sourceHash_ = cv::format("%08jx", (uintmax_t)hash);
          isHashUpdated = true;
      }
  
@@ -4536,14 +5117,32 @@ private:
  #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
  #endif
  
-class OpenCLAllocator CV_FINAL : public MatAllocator
+
+void Context::Impl::__init_buffer_pools()
  {
-    mutable OpenCLBufferPoolImpl bufferPool;
-    mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
-#ifdef  HAVE_OPENCL_SVM
-    mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
+    bufferPool_ = std::make_shared<OpenCLBufferPoolImpl>(0);
+    OpenCLBufferPoolImpl& bufferPool = *bufferPool_.get();
+    bufferPoolHostPtr_ = std::make_shared<OpenCLBufferPoolImpl>(CL_MEM_ALLOC_HOST_PTR);
+    OpenCLBufferPoolImpl& bufferPoolHostPtr = *bufferPoolHostPtr_.get();
+
+    size_t defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
+    size_t poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
+    bufferPool.setMaxReservedSize(poolSize);
+    size_t poolSizeHostPtr = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
+    bufferPoolHostPtr.setMaxReservedSize(poolSizeHostPtr);
+
+#ifdef HAVE_OPENCL_SVM
+    bufferPoolSVM_ = std::make_shared<OpenCLSVMBufferPoolImpl>();
+    OpenCLSVMBufferPoolImpl& bufferPoolSVM = *bufferPoolSVM_.get();
+    size_t poolSizeSVM = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
+    bufferPoolSVM.setMaxReservedSize(poolSizeSVM);
  #endif
  
+    CV_LOG_INFO(NULL, "OpenCL: Initializing buffer pool for context@" << contextId << " with max capacity: poolSize=" << poolSize << " poolSizeHostPtr=" << poolSizeHostPtr);
+}
+
+class OpenCLAllocator CV_FINAL : public MatAllocator
+{
  public:
      enum AllocatorFlags
      {
@@ -4556,20 +5155,7 @@ public:
      };
  
      OpenCLAllocator()
-        : bufferPool(0),
-          bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
-    {
-        size_t defaultPoolSize, poolSize;
-        defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
-        poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
-        bufferPool.setMaxReservedSize(poolSize);
-        poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
-        bufferPoolHostPtr.setMaxReservedSize(poolSize);
-#ifdef HAVE_OPENCL_SVM
-        poolSize = utils::getConfigurationParameterSizeT("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
-        bufferPoolSVM.setMaxReservedSize(poolSize);
-#endif
-
+    {
          matStdAllocator = Mat::getDefaultAllocator();
      }
      ~OpenCLAllocator()
@@ -4578,7 +5164,7 @@ public:
      }
  
      UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
-            int flags, UMatUsageFlags usageFlags) const
+            AccessFlag flags, UMatUsageFlags usageFlags) const
      {
          UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
          return u;
@@ -4595,7 +5181,7 @@ public:
          return value;
      }
  
-    void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
+    void getBestFlags(const Context& ctx, AccessFlag /*flags*/, UMatUsageFlags usageFlags, int& createFlags, UMatData::MemoryFlag& flags0) const
      {
          const Device& dev = ctx.device(0);
          createFlags = 0;
@@ -4611,16 +5197,19 @@ public:
                  )
              )
          )
-            flags0 = 0;
+            flags0 = static_cast<UMatData::MemoryFlag>(0);
          else
              flags0 = UMatData::COPY_ON_MAP;
      }
  
      UMatData* allocate(int dims, const int* sizes, int type,
-                       void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
+                       void* data, size_t* step, AccessFlag flags, UMatUsageFlags usageFlags) const CV_OVERRIDE
      {
          if(!useOpenCL())
              return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
+
+        flushCleanupQueue();
+
          CV_Assert(data == 0);
          size_t total = CV_ELEM_SIZE(type);
          for( int i = dims-1; i >= 0; i-- )
@@ -4631,9 +5220,12 @@ public:
          }
  
          Context& ctx = Context::getDefault();
-        flushCleanupQueue();
+        if (!ctx.getImpl())
+            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
+        Context::Impl& ctxImpl = *ctx.getImpl();
  
-        int createFlags = 0, flags0 = 0;
+        int createFlags = 0;
+        UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
          getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
  
          void* handle = NULL;
@@ -4644,7 +5236,7 @@ public:
          if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
          {
              allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
-            handle = bufferPoolSVM.allocate(total);
+            handle = ctxImpl.getBufferPoolSVM().allocate(total);
  
              // this property is constant, so single buffer pool can be used here
              bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
@@ -4655,12 +5247,12 @@ public:
          if (createFlags == 0)
          {
              allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
-            handle = bufferPool.allocate(total);
+            handle = ctxImpl.getBufferPool().allocate(total);
          }
          else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
          {
              allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
-            handle = bufferPoolHostPtr.allocate(total);
+            handle = ctxImpl.getBufferPoolHostPtr().allocate(total);
          }
          else
          {
@@ -4676,13 +5268,14 @@ public:
          u->handle = handle;
          u->flags = flags0;
          u->allocatorFlags_ = allocatorFlags;
+        u->allocatorContext = std::static_pointer_cast<void>(std::make_shared<ocl::Context>(ctx));
          CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
          u->markHostCopyObsolete(true);
          opencl_allocator_stats.onAllocate(u->size);
          return u;
      }
  
-    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
+    bool allocate(UMatData* u, AccessFlag accessFlags, UMatUsageFlags usageFlags) const CV_OVERRIDE
      {
          if(!u)
              return false;
@@ -4695,14 +5288,15 @@ public:
          {
              CV_Assert(u->origdata != 0);
              Context& ctx = Context::getDefault();
-            int createFlags = 0, flags0 = 0;
+            int createFlags = 0;
+            UMatData::MemoryFlag flags0 = static_cast<UMatData::MemoryFlag>(0);
              getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
  
              bool copyOnMap = (flags0 & UMatData::COPY_ON_MAP) != 0;
  
              cl_context ctx_handle = (cl_context)ctx.ptr();
              int allocatorFlags = 0;
-            int tempUMatFlags = 0;
+            UMatData::MemoryFlag tempUMatFlags = static_cast<UMatData::MemoryFlag>(0);
              void* handle = NULL;
              cl_int retval = CL_SUCCESS;
  
@@ -4798,7 +5392,7 @@ public:
              u->flags |= tempUMatFlags | flags0;
              u->allocatorFlags_ = allocatorFlags;
          }
-        if(accessFlags & ACCESS_WRITE)
+        if (!!(accessFlags & ACCESS_WRITE))
              u->markHostCopyObsolete(true);
          opencl_allocator_stats.onAllocate(u->size);
          return true;
@@ -4845,7 +5439,7 @@ public:
          CV_Assert(u->handle != 0);
          CV_Assert(u->mapcount == 0);
  
-        if (u->flags & UMatData::ASYNC_CLEANUP)
+        if (!!(u->flags & UMatData::ASYNC_CLEANUP))
              addToCleanupQueue(u);
          else
              deallocate_(u);
@@ -4987,15 +5581,26 @@ public:
              }
              if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
              {
-                bufferPool.release((cl_mem)u->handle);
+                std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
+                CV_Assert(pCtx);
+                ocl::Context& ctx = *pCtx.get();
+                CV_Assert(ctx.getImpl());
+                ctx.getImpl()->getBufferPool().release((cl_mem)u->handle);
              }
              else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
              {
-                bufferPoolHostPtr.release((cl_mem)u->handle);
+                std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
+                CV_Assert(pCtx);
+                ocl::Context& ctx = *pCtx.get();
+                CV_Assert(ctx.getImpl());
+                ctx.getImpl()->getBufferPoolHostPtr().release((cl_mem)u->handle);
              }
  #ifdef HAVE_OPENCL_SVM
              else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
              {
+                std::shared_ptr<ocl::Context> pCtx = std::static_pointer_cast<ocl::Context>(u->allocatorContext);
+                CV_Assert(pCtx);
+                ocl::Context& ctx = *pCtx.get();
                  if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
                  {
                      //nothing
@@ -5003,7 +5608,6 @@ public:
                  else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                          (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                  {
-                    Context& ctx = Context::getDefault();
                      const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                      CV_DbgAssert(svmFns->isValid());
                      cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
@@ -5015,7 +5619,8 @@ public:
                          CV_OCL_CHECK_RESULT(status, "clEnqueueSVMUnmap()");
                      }
                  }
-                bufferPoolSVM.release((void*)u->handle);
+                CV_Assert(ctx.getImpl());
+                ctx.getImpl()->getBufferPoolSVM().release((void*)u->handle);
              }
  #endif
              else
@@ -5031,11 +5636,11 @@ public:
      }
  
      // synchronized call (external UMatDataAutoLock, see UMat::getMat)
-    void map(UMatData* u, int accessFlags) const CV_OVERRIDE
+    void map(UMatData* u, AccessFlag accessFlags) const CV_OVERRIDE
      {
          CV_Assert(u && u->handle);
  
-        if(accessFlags & ACCESS_WRITE)
+        if (!!(accessFlags & ACCESS_WRITE))
              u->markDeviceCopyObsolete(true);
  
          cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
@@ -5102,7 +5707,7 @@ public:
              }
          }
  
-        if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
+        if (!!(accessFlags & ACCESS_READ) && u->hostCopyObsolete())
          {
              AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
  #ifdef HAVE_OPENCL_SVM
@@ -5731,22 +6336,26 @@ public:
          }
      }
  
-    BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE {
+    BufferPoolController* getBufferPoolController(const char* id) const CV_OVERRIDE
+    {
+        ocl::Context ctx = Context::getDefault();
+        if (ctx.empty())
+            return NULL;
  #ifdef HAVE_OPENCL_SVM
          if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
          {
-            return &bufferPoolSVM;
+            return &ctx.getImpl()->getBufferPoolSVM();
          }
  #endif
          if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
          {
-            return &bufferPoolHostPtr;
+            return &ctx.getImpl()->getBufferPoolHostPtr();
          }
          if (id != NULL && strcmp(id, "OCL") != 0)
          {
              CV_Error(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
          }
-        return &bufferPool;
+        return &ctx.getImpl()->getBufferPool();
      }
  
      MatAllocator* matStdAllocator;
@@ -5841,7 +6450,7 @@ void convertFromBuffer(void* cl_mem_buffer, size_t step, int rows, int cols, int
      dst.u = new UMatData(getOpenCLAllocator());
      dst.u->data            = 0;
      dst.u->allocatorFlags_ = OpenCLAllocator::ALLOCATOR_FLAGS_EXTERNAL_BUFFER;  // not allocated from any OpenCV buffer pool
-    dst.u->flags           = 0;
+    dst.u->flags           = static_cast<UMatData::MemoryFlag>(0);
      dst.u->handle          = cl_mem_buffer;
      dst.u->origdata        = 0;
      dst.u->prevAllocator   = 0;
@@ -6109,10 +6718,13 @@ const char* typeToStr(int type)
          "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
          "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
          "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
-        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
+        "half", "half2", "half3", "half4", 0, 0, 0, "half8", 0, 0, 0, 0, 0, 0, 0, "half16",
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
      };
      int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
-    return cn > 16 ? "?" : tab[depth*16 + cn-1];
+    const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
+    CV_Assert(result);
+    return result;
  }
  
  const char* memopTypeToStr(int type)
@@ -6126,10 +6738,13 @@ const char* memopTypeToStr(int type)
          "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
          "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
          "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
-        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
+        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
      };
      int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
-    return cn > 16 ? "?" : tab[depth*16 + cn-1];
+    const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
+    CV_Assert(result);
+    return result;
  }
  
  const char* vecopTypeToStr(int type)
@@ -6143,10 +6758,13 @@ const char* vecopTypeToStr(int type)
          "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
          "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
          "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
-        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
+        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
      };
      int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
-    return cn > 16 ? "?" : tab[depth*16 + cn-1];
+    const char* result = cn > 16 ? 0 : tab[depth*16 + cn-1];
+    CV_Assert(result);
+    return result;
  }
  
  const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
@@ -6731,4 +7349,15 @@ uint64 Timer::durationNS() const
  
  }} // namespace
  
+#ifdef HAVE_DIRECTX
+namespace cv { namespace directx { namespace internal {
+OpenCLDirectXImpl* getDirectXImpl(ocl::Context& ctx)
+{
+    ocl::Context::Impl* i = ctx.getImpl();
+    CV_Assert(i);
+    return i->getDirectXImpl();
+}
+}}} // namespace cv::directx::internal
+#endif
+
  #endif // HAVE_OPENCL