Merge remote-tracking branch 'upstream/3.4' into merge-3.4
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index c50dae7..f39305e 100644
--- a/modules/dnn/src/dnn.cpp
+++ b/modules/dnn/src/dnn.cpp
 #include "op_halide.hpp"
 #include "op_inf_engine.hpp"
 #include "ie_ngraph.hpp"
+#include "op_vkcom.hpp"
+#include "op_cuda.hpp"
+
+#ifdef HAVE_CUDA
+#include "cuda4dnn/init.hpp"
+#include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
+#endif
 
 #include "halide_scheduler.hpp"
+
 #include <set>
 #include <algorithm>
 #include <iostream>
@@ -52,6 +60,7 @@
 #include <fstream>
 #include <iterator>
 #include <numeric>
+#include <memory>
 #include <opencv2/dnn/shape_utils.hpp>
 #include <opencv2/imgproc.hpp>
 
@@ -60,7 +69,7 @@
 
 namespace cv {
 namespace dnn {
-CV__DNN_EXPERIMENTAL_NS_BEGIN
+CV__DNN_INLINE_NS_BEGIN
 
 static size_t DNN_NETWORK_DUMP = utils::getConfigurationParameterSizeT("OPENCV_DNN_NETWORK_DUMP", 0);
 
@@ -213,6 +222,20 @@ private:
 #endif
 
         backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
+
+#ifdef HAVE_VULKAN
+        if (haveVulkan())
+            backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
+#endif
+
+#ifdef HAVE_CUDA
+        if (haveCUDA() && cuda4dnn::isDeviceCompatible())
+        {
+            backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
+            if (cuda4dnn::doesDeviceSupportFP16())
+                backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
+        }
+#endif
     }
 
     BackendsList backends;
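
Once the registration above runs, the CUDA backend/target pair becomes selectable through the public Net API. A minimal usage sketch, assuming an OpenCV build with CUDA support and a compatible device (the model path is a placeholder):

    #include <opencv2/dnn.hpp>
    #include <string>

    // Sketch: request the CUDA backend/target registered by BackendRegistry above.
    cv::dnn::Net makeCudaNet(const std::string& modelPath)
    {
        cv::dnn::Net net = cv::dnn::readNet(modelPath);
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);  // or DNN_TARGET_CUDA_FP16
        return net;
    }
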
@@ -549,6 +572,13 @@ struct LayerData
     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
 
+#ifdef HAVE_CUDA
+    /* output ids which must be transferred to the host in the background
+     * after the completion of the forward pass of the layer
+     */
+    std::vector<int> cudaD2HBackgroundTransfers;
+#endif
+
     Ptr<Layer> layerInstance;
     std::vector<Mat> outputBlobs;
     std::vector<Mat*> inputBlobs;
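
The cudaD2HBackgroundTransfers list records output ids whose data must end up on the host; they are downloaded on a dedicated stream so the copy overlaps with the forward pass of later layers (see copyToHostInBackground further down). The underlying pattern, sketched in plain CUDA runtime terms (illustrative only, not the cuda4dnn code):

    #include <cuda_runtime.h>

    // Sketch: enqueue a device-to-host copy on a separate stream so it overlaps with
    // kernels still running on the compute stream. For real overlap the destination
    // must be page-locked (pinned) host memory.
    void backgroundDownload(void* hostDst, const void* deviceSrc, size_t bytes,
                            cudaStream_t d2hStream)
    {
        cudaMemcpyAsync(hostDst, deviceSrc, bytes, cudaMemcpyDeviceToHost, d2hStream);
        // cudaStreamSynchronize(d2hStream) is issued only when the host data is needed.
    }
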
@@ -1075,6 +1105,29 @@ static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
         CV_Error(Error::StsNotImplemented, "This OpenCV version is built without support of Inference Engine + nGraph");
 #endif
     }
+    else if (backendId == DNN_BACKEND_VKCOM)
+    {
+        CV_Assert(haveVulkan());
+#ifdef HAVE_VULKAN
+        return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
+#endif  // HAVE_VULKAN
+    }
+    else if (backendId == DNN_BACKEND_CUDA)
+    {
+        CV_Assert(haveCUDA());
+
+#ifdef HAVE_CUDA
+        switch (targetId)
+        {
+        case DNN_TARGET_CUDA:
+            return CUDABackendWrapperFP32::create(m);
+        case DNN_TARGET_CUDA_FP16:
+            return CUDABackendWrapperFP16::create(m);
+        default:
+            CV_Assert(IS_DNN_CUDA_TARGET(targetId));
+        }
+#endif
+    }
     else
         CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
     return Ptr<BackendWrapper>();  // TODO Error?
@@ -1141,6 +1194,19 @@ struct Net::Impl : public detail::NetImplBase
     std::vector<int64> layersTimings;
     Mat output_blob;
 
+#ifdef HAVE_CUDA
+    struct CudaInfo_t
+    {
+        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt, cuda4dnn::csl::Stream d2h_stream_)
+         : context(std::move(ctxt)), d2h_stream(std::move(d2h_stream_)) { }
+        cuda4dnn::csl::CSLContext context;
+        cuda4dnn::csl::Stream d2h_stream;
+        cuda4dnn::csl::Workspace workspace;
+    };
+
+    std::unique_ptr<CudaInfo_t> cudaInfo;
+#endif
+
     Ptr<BackendWrapper> wrap(Mat& host)
     {
         if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
@@ -1178,6 +1244,27 @@ struct Net::Impl : public detail::NetImplBase
             {
                 return wrapMat(preferableBackend, preferableTarget, host);
             }
+            else if (preferableBackend == DNN_BACKEND_VKCOM)
+            {
+  #ifdef HAVE_VULKAN
+                return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
+  #endif
+            }
+            else if (preferableBackend == DNN_BACKEND_CUDA)
+            {
+                CV_Assert(haveCUDA());
+#ifdef HAVE_CUDA
+                switch (preferableTarget)
+                {
+                case DNN_TARGET_CUDA:
+                    return CUDABackendWrapperFP32::create(baseBuffer, shape);
+                case DNN_TARGET_CUDA_FP16:
+                    return CUDABackendWrapperFP16::create(baseBuffer, shape);
+                default:
+                    CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
+                }
+#endif
+            }
             else
                 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
         }
@@ -1295,6 +1382,10 @@ struct Net::Impl : public detail::NetImplBase
                   preferableTarget == DNN_TARGET_FPGA
             );
         }
+        CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
+                  preferableTarget == DNN_TARGET_VULKAN);
+        CV_Assert(preferableBackend != DNN_BACKEND_CUDA ||
+                  IS_DNN_CUDA_TARGET(preferableTarget));
         if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
         {
             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
@@ -1324,6 +1415,23 @@ struct Net::Impl : public detail::NetImplBase
                 }
             }
 #endif
+            if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
+            {
+                preferableBackend = DNN_BACKEND_OPENCV;
+                preferableTarget = DNN_TARGET_CPU;
+            }
+
+            if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA())
+            {
+#ifdef HAVE_CUDA
+                CV_LOG_WARNING(NULL, "unable to use CUDA backend; switching to CPU");
+#else
+                CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU");
+#endif
+                preferableBackend = DNN_BACKEND_OPENCV;
+                preferableTarget = DNN_TARGET_CPU;
+            }
+
             clear();
 
             this->blobsToKeep = blobsToKeep_;
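
Instead of relying on this silent fallback to the CPU, a caller can ask the registry whether the CUDA backend is actually usable before selecting it; a small sketch, assuming this build exposes cv::dnn::getAvailableTargets (which queries the same BackendRegistry populated above):

    #include <opencv2/dnn.hpp>

    // Sketch: true when the CUDA backend was built in and a compatible device exists.
    bool cudaBackendUsable()
    {
        return !cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_CUDA).empty();
    }
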
@@ -1336,7 +1444,7 @@ struct Net::Impl : public detail::NetImplBase
 
             initBackend(blobsToKeep_);
 
-            if (!netWasAllocated )
+            if (!netWasAllocated)
             {
 #ifdef HAVE_HALIDE
                 if (preferableBackend == DNN_BACKEND_HALIDE)
@@ -1477,7 +1585,9 @@ struct Net::Impl : public detail::NetImplBase
     {
         CV_TRACE_FUNCTION();
         if (preferableBackend == DNN_BACKEND_OPENCV)
+        {
             CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
+        }
         else if (preferableBackend == DNN_BACKEND_HALIDE)
             initHalideBackend();
         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
@@ -1496,6 +1606,10 @@ struct Net::Impl : public detail::NetImplBase
             CV_Error(Error::StsNotImplemented, "This OpenCV version is built without support of Inference Engine + nGraph");
 #endif
         }
+        else if (preferableBackend == DNN_BACKEND_VKCOM)
+            initVkComBackend();
+        else if (preferableBackend == DNN_BACKEND_CUDA)
+            initCUDABackend(blobsToKeep_);
         else
             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
     }
@@ -2193,6 +2307,125 @@ struct Net::Impl : public detail::NetImplBase
     }
 #endif  // HAVE_DNN_NGRAPH
 
+    void initVkComBackend()
+    {
+        CV_TRACE_FUNCTION();
+        CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
+#ifdef HAVE_VULKAN
+        if (!haveVulkan())
+            return;
+
+        MapIdToLayerData::iterator it = layers.begin();
+        for (; it != layers.end(); it++)
+        {
+            LayerData &ld = it->second;
+            Ptr<Layer> layer = ld.layerInstance;
+            if (!layer->supportBackend(preferableBackend))
+            {
+                continue;
+            }
+
+            ld.skip = false;
+
+            try
+            {
+                ld.backendNodes[DNN_BACKEND_VKCOM] =
+                    layer->initVkCom(ld.inputBlobsWrappers);
+            }
+            catch (const cv::Exception& e)
+            {
+                CV_LOG_ERROR(NULL, "initVkCom failed, fallback to CPU implementation. " << e.what());
+                ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
+            }
+        }
+#endif
+    }
+
+    void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
+    {
+        CV_Assert(haveCUDA());
+        CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
+
+#ifdef HAVE_CUDA
+        if (cuda4dnn::getDeviceCount() <= 0)
+            CV_Error(Error::StsError, "No CUDA capable device found.");
+
+        if (cuda4dnn::getDevice() < 0)
+            CV_Error(Error::StsError, "No CUDA capable device selected.");
+
+        if (!cuda4dnn::isDeviceCompatible())
+            CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
+
+        if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
+            CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
+
+        if (!cudaInfo)
+        {
+            cuda4dnn::csl::CSLContext context;
+            context.stream = cuda4dnn::csl::Stream(true);
+            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
+            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
+
+            auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
+            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
+            cuda4dnn::checkVersions();
+        }
+
+        cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
+
+        for (auto& layer : layers)
+        {
+            auto& ld = layer.second;
+            if (ld.id == 0)
+            {
+                for (auto& wrapper : ld.inputBlobsWrappers)
+                {
+                    auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                    cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+                }
+            }
+
+            for (auto& wrapper : ld.outputBlobsWrappers)
+            {
+                auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
+                cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
+            }
+        }
+
+        for (auto& layer : layers)
+        {
+            auto& ld = layer.second;
+            auto& layerInstance = ld.layerInstance;
+
+            if (!layerInstance->supportBackend(DNN_BACKEND_CUDA))
+            {
+                std::ostringstream os;
+                os << "CUDA backend will fall back to the CPU implementation for the layer \"" << ld.name
+                   << "\" of type " << ld.type << '\n';
+                CV_LOG_INFO(NULL, os.str().c_str());
+                continue;
+            }
+
+            /* we make a copy so that `initCUDA` doesn't modify `cudaInfo->context` */
+            auto context = cudaInfo->context;
+            auto node = layerInstance->initCUDA(&context, ld.inputBlobsWrappers, ld.outputBlobsWrappers);
+            ld.backendNodes[DNN_BACKEND_CUDA] = node;
+
+            auto cudaNode = node.dynamicCast<CUDABackendNode>();
+            cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
+        }
+
+        if (blobsToKeep_.size() > 1)
+        {
+            for (const auto& pin : blobsToKeep_)
+            {
+                LayerData& ld = layers[pin.lid];
+                ld.cudaD2HBackgroundTransfers.push_back(pin.oid);
+            }
+        }
+#endif
+    }
+
     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
     {
         CV_TRACE_FUNCTION();
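
The Workspace::require() calls in initCUDABackend size a single scratch buffer to the largest requirement over all layers, and that one buffer is then shared by every layer during the forward pass. The pattern, distilled into a standalone sketch (illustrative only, not the csl API):

    #include <algorithm>
    #include <cstddef>

    // Sketch of the shared-workspace pattern: grow the capacity to the maximum
    // requested size; allocation happens once, after all layers have been queried.
    struct SharedWorkspace
    {
        std::size_t capacity = 0;
        void require(std::size_t bytes) { capacity = std::max(capacity, bytes); }
    };
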
@@ -2236,9 +2469,7 @@ struct Net::Impl : public detail::NetImplBase
             ninputs = netInputLayer->inputsData.size();
             ld.inputBlobsWrappers.resize(ninputs);
             for (size_t i = 0; i < ninputs; i++)
-            {
                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
-            }
         }
         else
         {
@@ -2264,14 +2495,12 @@ struct Net::Impl : public detail::NetImplBase
                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
         for (int i = 0; i < ld.outputBlobs.size(); ++i)
-        {
             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
-        }
-        ld.internalBlobsWrappers.resize(ld.internals.size());
-        for (int i = 0; i < ld.internals.size(); ++i)
-        {
+
+        /* CUDA backend has its own system for internal blobs; we don't need these */
+        ld.internalBlobsWrappers.resize((preferableBackend == DNN_BACKEND_CUDA) ? 0 : ld.internals.size());
+        for (int i = 0; i < ld.internalBlobsWrappers.size(); ++i)
             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
-        }
 
         Ptr<Layer> layerPtr = ld.getLayerInstance();
         {
@@ -2311,6 +2540,7 @@ struct Net::Impl : public detail::NetImplBase
         CV_TRACE_FUNCTION();
 
         if(!fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
+                        preferableBackend != DNN_BACKEND_CUDA &&
                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 &&
                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH))
            return;
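
The `fusion` flag gating this pass is controlled through the public API; turning it off is occasionally useful when comparing per-layer outputs against a reference. A minimal sketch (net is a cv::dnn::Net):

    net.enableFusion(false);  // fuseLayers() then returns immediately for this network
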
@@ -2342,6 +2572,11 @@ struct Net::Impl : public detail::NetImplBase
                 LayerPin lpNext(ld.consumers[0].lid, 0);
                 while (nextData)
                 {
+                    /* we use the `tryFuse` member of the convolution layer to fuse eltwise later;
+                     * it is not intended to be fused here, so we stop when we encounter an eltwise layer
+                     */
+                    if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" && nextData->type == "Eltwise")
+                        break;
                     Ptr<Layer> nextLayer = nextData->layerInstance;
                     if (currLayer->tryFuse(nextLayer))
                     {
@@ -2365,7 +2600,7 @@ struct Net::Impl : public detail::NetImplBase
                         break;
                 }
 
-                if (preferableBackend != DNN_BACKEND_OPENCV)
+                if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                     continue;  // Go to the next layer.
 
                 // TODO: OpenCL target support more fusion styles.
@@ -2375,6 +2610,11 @@ struct Net::Impl : public detail::NetImplBase
                      ld.layerInstance->type != "Concat")) )
                     continue;
 
+                if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
+                    && ld.layerInstance->type != "Convolution"
+                    && ld.layerInstance->type != "Concat")
+                    continue;
+
                 while (nextData)
                 {
                     // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@@ -2412,15 +2652,30 @@ struct Net::Impl : public detail::NetImplBase
                         break;
                 }
 
-                // fuse convolution layer followed by eltwise + relu
-                if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
+                // OpenCL: fuse convolution layer followed by eltwise + relu
+                // CUDA: fuse convolution layer followed by eltwise (and optional activation)
+                if ((IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
+                    ld.layerInstance->type == "Convolution" )
                 {
                     Ptr<EltwiseLayer> nextEltwiseLayer;
                     if( nextData )
                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
-
-                    if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
-                        nextData && nextData->inputBlobsId.size() == 2 )
+#ifdef HAVE_CUDA
+                    // the CUDA backend supports fusion with an eltwise sum (all operands must have the same number of channels)
+                    // `nextEltwiseLayer` is reset if the eltwise layer's configuration is not compatible with fusion
+                    if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
+                    {
+                        // we create a temporary backend node for the eltwise layer to obtain its configuration
+                        cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
+                        const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
+                        const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
+                        // the CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+                        // Hence, a successful cast to EltwiseOp implies that the number of channels is the same in all operand tensors.
+                        if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+                            nextEltwiseLayer = Ptr<EltwiseLayer>();
+                    }
+#endif
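
Distilled, the check inside the HAVE_CUDA block above reduces to a simple predicate; the names below are illustrative only (the real decision is made by casting the temporary node to cuda4dnn::EltwiseOpBase):

    // Sketch: eltwise fusion into the convolution is attempted only for a
    // coefficient-free SUM whose operands all have the same channel count.
    struct EltwiseFusionQuery
    {
        bool isSum;          // the eltwise operation is a plain sum
        bool hasCoeffs;      // per-input scaling coefficients are present
        bool channelsMatch;  // all operands have the same number of channels
    };

    static bool canFuseEltwiseIntoConv(const EltwiseFusionQuery& q)
    {
        return q.isSum && !q.hasCoeffs && q.channelsMatch;
    }
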
+                    if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
                     {
                         LayerData *eltwiseData = nextData;
 
@@ -2449,65 +2704,163 @@ struct Net::Impl : public detail::NetImplBase
                         }
                         CV_Assert(biasLayerData);
                         {
-                            if( eltwiseData->consumers.size() == 1 )
+                            // fuse eltwise + activation layer
+                            // the bias must already be computed for fusion, so the bias layer must appear before the convolution
+                            if (biasLayerData->id < ld.id)
                             {
-                                // fuse eltwise + activation layer
-                                if (biasLayerData->id < ld.id)
+                                /* we can fuse the activation if:
+                                 * => the activation layer that follows is the only consumer of the eltwise output
+                                 * => the activation layer does not process multiple inputs
+                                 * => we are not required to keep the eltwise output
+                                 */
+                                Ptr<ActivationLayer> nextFusabeleActivLayer;
+                                if (eltwiseData->consumers.size() == 1 && pinsToKeep.count(lpNext) == 0)
                                 {
                                     nextData = &layers[eltwiseData->consumers[0].lid];
                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
-                                    Ptr<ActivationLayer> nextActivLayer;
-                                    if( nextData )
-                                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
-                                    if( !nextActivLayer.empty() &&
-                                            (!nextData->type.compare("ReLU") ||
-                                             !nextData->type.compare("ChannelsPReLU") ||
-                                             !nextData->type.compare("Power")) &&
-                                            currLayer->setActivation(nextActivLayer) )
+                                    CV_Assert(nextData);
+                                    if (nextData->outputBlobs.size() == 1)
+                                        nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+                                }
+                                else
+                                {
+                                    // OCL backend cannot fuse in this case but the CUDA backend can continue with just eltwise
+                                    nextData = 0;
+                                }
+
+                                // the requirements of the OpenCV OpenCL backend and the CUDA backend are different;
+                                // we need to check them separately, hence the separate fuse flags
+                                bool fuse_eltwise = false, fuse_activation = false;
+
+                                if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() &&
+                                    nextData &&
+                                    (!nextData->type.compare("ReLU") ||
+                                     !nextData->type.compare("ChannelsPReLU") ||
+                                     !nextData->type.compare("Power")) &&
+                                    currLayer->setActivation(nextFusabeleActivLayer))
+                                {
+                                    fuse_eltwise = true;
+                                    fuse_activation = true;
+                                }
+
+                                if (IS_DNN_CUDA_TARGET(preferableTarget))
+                                {
+                                    /* supported fusion options:
+                                     * => convolution + eltwise
+                                     * => activation(convolution) + eltwise
+                                     *    > convolution + activation would have been fused already; we have to fuse eltwise
+                                     * => activation(convolution + eltwise)
+                                     *    > fuse eltwise and then activation
+                                     */
+                                    auto layer = nextEltwiseLayer.staticCast<Layer>();
+                                    if (currLayer->tryFuse(layer))
+                                    {
+                                        fuse_eltwise = true; /* eltwise was successfully fused */
+                                        if (!nextFusabeleActivLayer.empty() && nextData)
+                                        {
+                                            if ((!nextData->type.compare("ReLU") ||
+                                                 !nextData->type.compare("ReLU6") ||
+                                                 !nextData->type.compare("Power") ||
+                                                 !nextData->type.compare("TanH") ||
+                                                 !nextData->type.compare("Sigmoid") ||
+                                                 !nextData->type.compare("Swish") ||
+                                                 !nextData->type.compare("Mish")) &&
+                                                currLayer->setActivation(nextFusabeleActivLayer))
+                                            {
+                                                // activation was fused
+                                                fuse_activation = true;
+                                            }
+                                        }
+                                    }
+                                }
+
+                                CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */
+                                if(fuse_eltwise && fuse_activation)
+                                {
+                                    CV_Assert(nextData);
+                                    CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
+                                    ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+                                    printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+                                    printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
+                                    eltwiseData->skip = true;
+                                    nextData->skip = true;
+                                    // This optimization is for cases like
+                                    // some_layer   conv
+                                    //   |             |
+                                    //   +-- eltwise --+
+                                    //          |
+                                    //        activ
+                                    // This way all the element-wise computations
+                                    // (i.e. some_layer+conv or some_layer*conv)
+                                    // would be done at [conv] layer. So we need to
+                                    // replace [conv]'s output blob with [eltwise]'s one
+                                    // considering that [activ] is an in-place layer.
+                                    // Also we need to move all the consumers' references.
+                                    // To prevent memory collisions (i.e. when input of
+                                    // [conv] and output of [eltwise] is the same blob)
+                                    // we allocate a new blob.
+                                    CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+                                    ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+                                    ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+                                    eltwiseData->outputBlobs = ld.outputBlobs;
+                                    nextData->outputBlobs = ld.outputBlobs;
+                                    eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+                                    nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+                                    // Move references of [activ] layer consumers to the newly allocated blob.
+                                    for (int i = 0; i < nextData->consumers.size(); ++i)
                                     {
-                                        CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
-                                        ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
-                                        printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
-                                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
-                                        eltwiseData->skip = true;
-                                        nextData->skip = true;
-                                        // This optimization for cases like
-                                        // some_layer   conv
-                                        //   |             |
-                                        //   +-- eltwise --+
-                                        //          |
-                                        //        activ
-                                        // This way all the element-wise computations
-                                        // (i.e. some_layer+conv or some_layer*conv)
-                                        // would be done at [conv] layer. So we need to
-                                        // replace [conv]'s output blob to [eltwise]'s one
-                                        // considering that [activ] is an in-place layer.
-                                        // Also we need to move all the consumers' references.
-                                        // To prevent memory collisions (i.e. when input of
-                                        // [conv] and output of [eltwise] is the same blob)
-                                        // we allocate a new blob.
-                                        CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
-                                        ld.outputBlobs[0] = ld.outputBlobs[0].clone();
-                                        ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
-
-                                        eltwiseData->outputBlobs = ld.outputBlobs;
-                                        nextData->outputBlobs = ld.outputBlobs;
-                                        eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
-                                        nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
-
-                                        // Move references of [activ] layer consumers to the newly allocated blob.
-                                        for (int i = 0; i < nextData->consumers.size(); ++i)
+                                        LayerData& consumer = layers[nextData->consumers[i].lid];
+                                        for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
                                         {
-                                            LayerData& consumer = layers[nextData->consumers[i].lid];
-                                            for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+                                            if (consumer.inputBlobsId[j].lid == lpNext.lid)
                                             {
-                                                if (consumer.inputBlobsId[j].lid == lpNext.lid)
-                                                {
-                                                    consumer.inputBlobs[j] = &ld.outputBlobs[0];
-                                                    consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
-                                                    break;
-                                                }
+                                                consumer.inputBlobs[j] = &ld.outputBlobs[0];
+                                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                                break;
+                                            }
+                                        }
+                                    }
+                                }
+                                else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
+                                {
+                                    CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
+                                    CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
+                                    ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+                                    printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+                                    eltwiseData->skip = true;
+                                    // This optimization is for cases like
+                                    // some_layer   conv (maybe fused with activ)
+                                    //   |             |
+                                    //   +-- eltwise --+
+                                    //
+                                    // This way all the element-wise computations
+                                    // (i.e. some_layer+conv or some_layer*conv)
+                                    // would be done at [conv] layer. So we need to
+                                    // replace [conv]'s output blob with [eltwise]'s one.
+                                    // Also we need to move all the consumers' references.
+                                    // To prevent memory collisions (i.e. when input of
+                                    // [conv] and output of [eltwise] is the same blob)
+                                    // we allocate a new blob.
+                                    CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+                                    ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+                                    ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+                                    eltwiseData->outputBlobs = ld.outputBlobs;
+                                    eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+                                    // Move references of [eltwise] layer consumers to the newly allocated blob.
+                                    for (int i = 0; i < eltwiseData->consumers.size(); ++i)
+                                    {
+                                        LayerData& consumer = layers[eltwiseData->consumers[i].lid];
+                                        for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+                                        {
+                                            if (consumer.inputBlobsId[j].lid == eltwiseData->id)
+                                            {
+                                                consumer.inputBlobs[j] = &ld.outputBlobs[0];
+                                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                                break;
                                             }
                                         }
                                     }
@@ -2518,7 +2871,7 @@ struct Net::Impl : public detail::NetImplBase
                 }
             }
 
-            if (preferableBackend != DNN_BACKEND_OPENCV)
+            if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
                 continue;  // Go to the next layer.
 
             // the optimization #2. if there is concat layer that concatenates channels
@@ -2586,6 +2939,21 @@ struct Net::Impl : public detail::NetImplBase
 
                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
                             break;
+#ifdef HAVE_CUDA
+                        if (preferableBackend == DNN_BACKEND_CUDA &&
+                            (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
+                             (inp_i_data->layerInstance->type != "Convolution" &&
+                              inp_i_data->layerInstance->type != "Pooling" &&
+                              inp_i_data->layerInstance->type != "Resize"  &&
+                              inp_i_data->layerInstance->type != "Flatten" &&
+                              inp_i_data->layerInstance->type != "Permute" &&
+                              inp_i_data->layerInstance->type != "Reorg" &&
+                              inp_i_data->layerInstance->type != "Eltwise" &&
+                              inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
+                        {
+                            break;
+                        }
+#endif
                         realinputs[i] = pin;
                     }
 
@@ -2604,6 +2972,11 @@ struct Net::Impl : public detail::NetImplBase
                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
                         }
 #endif
+
+#ifdef HAVE_CUDA
+                        if (preferableBackend == DNN_BACKEND_CUDA)
+                            ld.outputBlobsWrappers[0] = wrap(output);
+#endif
                         std::vector<Range> chrange(output.dims, Range::all());
                         int ofs = 0;
                         for( i = 0; i < ninputs; i++ )
@@ -2628,10 +3001,39 @@ struct Net::Impl : public detail::NetImplBase
                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
                             }
 #endif
+#ifdef HAVE_CUDA
+                            if (preferableBackend == DNN_BACKEND_CUDA)
+                            {
+                                auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
+                                auto offset = chrange[axis].start * output_slice.total(axis + 1, output.dims);
+                                auto new_shape = shape(output_slice);
+                                cuda_wrapper->update(new_shape, offset);
+                                inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
+                            }
+#endif
                             // Layers that refer old input Mat will refer to the
                             // new data but the same Mat object.
                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
                         }
+
+#ifdef HAVE_CUDA
+                        if (preferableBackend == DNN_BACKEND_CUDA)
+                        {
+                            for (int i = 0; i < ld.consumers.size(); i++)
+                            {
+                                LayerData& consumer = layers[ld.consumers[i].lid];
+                                for (int j = 0; j < consumer.inputBlobsId.size(); j++)
+                                {
+                                    if (consumer.inputBlobsId[j].lid == ld.id)
+                                    {
+                                        CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
+                                        consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+                                        break;
+                                    }
+                                }
+                            }
+                        }
+#endif
                         ld.skip = true;
                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
                     }
@@ -2666,6 +3068,15 @@ struct Net::Impl : public detail::NetImplBase
 
         blobManager.reset();
         backendWrappers.clear();
+
+        for(auto& layer : layers)
+        {
+            auto& ld = layer.second;
+            ld.inputBlobsWrappers.clear();
+            ld.outputBlobsWrappers.clear();
+            ld.internalBlobsWrappers.clear();
+        }
+
         // Fake references to input blobs.
         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
             blobManager.addReference(LayerPin(0, i));
@@ -2862,7 +3273,24 @@ struct Net::Impl : public detail::NetImplBase
             {
                 Ptr<BackendNode> node = it->second;
                 CV_Assert(!node.empty());
-                if (preferableBackend == DNN_BACKEND_HALIDE)
+                if (preferableBackend == DNN_BACKEND_CUDA)
+                {
+                    CV_Assert(haveCUDA());
+
+#ifdef HAVE_CUDA
+                    Ptr<CUDABackendNode> cudaNode = node.dynamicCast<CUDABackendNode>();
+                    CV_Assert(!cudaNode.empty());
+
+                    cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
+
+                    for (auto id : ld.cudaD2HBackgroundTransfers)
+                    {
+                        auto wrapper = ld.outputBlobsWrappers[id].dynamicCast<CUDABackendWrapper>();
+                        wrapper->copyToHostInBackground();
+                    }
+#endif
+                }
+                else if (preferableBackend == DNN_BACKEND_HALIDE)
                 {
                     forwardHalide(ld.outputBlobsWrappers, node);
                 }
@@ -2874,6 +3302,19 @@ struct Net::Impl : public detail::NetImplBase
                 {
                     forwardNgraph(ld.outputBlobsWrappers, node, isAsync);
                 }
+                else if (preferableBackend == DNN_BACKEND_VKCOM)
+                {
+                    try
+                    {
+                        forwardVkCom(ld.outputBlobsWrappers, node);
+                    }
+                    catch (const cv::Exception& e)
+                    {
+                        CV_LOG_ERROR(NULL, "forwardVkCom failed, fallback to CPU implementation. " << e.what());
+                        it->second = Ptr<BackendNode>();
+                        forwardLayer(ld);
+                    }
+                }
                 else
                 {
                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
@@ -2916,6 +3357,11 @@ struct Net::Impl : public detail::NetImplBase
 
         //forward itself
         forwardLayer(ld);
+
+#ifdef HAVE_CUDA
+        if (preferableBackend == DNN_BACKEND_CUDA)
+            cudaInfo->context.stream.synchronize();
+#endif
     }
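
Because the compute stream is synchronized right after the forward pass, Net::forward() only returns once all device work has finished, so the output Mat is immediately safe to read on the host. An end-to-end usage sketch (input size and preprocessing are placeholders):

    #include <opencv2/dnn.hpp>

    // Sketch: one inference on the CUDA backend; forward() blocks until the
    // stream synchronization above has completed.
    cv::Mat runOnce(cv::dnn::Net& net, const cv::Mat& image)
    {
        cv::Mat blob = cv::dnn::blobFromImage(image, 1.0, cv::Size(224, 224));
        net.setInput(blob);
        return net.forward();
    }
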
 
     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
@@ -3054,7 +3500,7 @@ struct Net::Impl : public detail::NetImplBase
         LayerData &ld = layers[pin.lid];
         if ((size_t)pin.oid >= ld.outputBlobs.size())
         {
-            CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %d outputs, "
+            CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %zu outputs, "
                                            "the #%d was requested", ld.name.c_str(),
                                            ld.outputBlobs.size(), pin.oid));
         }
@@ -3092,7 +3538,7 @@ struct Net::Impl : public detail::NetImplBase
         {
             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produce only %d outputs, "
                                            "the #%d was requested", ld.name.c_str(),
-                                           ld.outputBlobs.size(), pin.oid));
+                                           (int)ld.outputBlobs.size(), (int)pin.oid));
         }
         if (preferableTarget != DNN_TARGET_CPU)
         {
@@ -3910,7 +4356,7 @@ string Net::Impl::dump()
             prevNode = itBackend->second;
         }
     }
-    string colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462"};
+    string colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151"};
     string backend;
     switch (prefBackend)
     {
@@ -3920,6 +4366,8 @@ string Net::Impl::dump()
         case DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019: backend = "DLIE/"; break;
         case DNN_BACKEND_INFERENCE_ENGINE_NGRAPH: backend = "NGRAPH/"; break;
         case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
+        case DNN_BACKEND_VKCOM: backend = "VULKAN/"; break;
+        case DNN_BACKEND_CUDA: backend = "CUDA/"; break;
         // don't use default:
     }
     out << "digraph G {\n";
@@ -4053,7 +4501,10 @@ string Net::Impl::dump()
             case DNN_TARGET_OPENCL: out << "OCL"; colorId = 1; break;
             case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16"; colorId = 2; break;
             case DNN_TARGET_MYRIAD: out << "MYRIAD"; colorId = 3; break;
+            case DNN_TARGET_VULKAN: out << "VULKAN"; colorId = 7; break;
             case DNN_TARGET_FPGA: out << "FPGA"; colorId = 4; break;
+            case DNN_TARGET_CUDA: out << "CUDA"; colorId = 5; break;
+            case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16"; colorId = 6; break;
             // don't use default:
         }
         out << "\\n";  // align center
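
The extra colour entries and target labels feed the graph produced by Net::dump(); writing it to a file and rendering it with Graphviz is a quick way to check which backend/target each layer was assigned (sketch, typically after a first forward() call):

    net.dumpToFile("net.dot");  // render with: dot -Tpng net.dot -o net.png
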
@@ -4468,6 +4919,23 @@ bool Layer::supportBackend(int backendId)
     return backendId == DNN_BACKEND_OPENCV;
 }
 
+Ptr<BackendNode> Layer::initCUDA(
+    void*,
+    const std::vector<Ptr<BackendWrapper>>&,
+    const std::vector<Ptr<BackendWrapper>>&)
+{
+    CV_Error(Error::StsNotImplemented, "CUDA pipeline of " + type +
+                                       " layers is not defined.");
+    return Ptr<BackendNode>();
+}
+
+Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
+{
+    CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
+                                       " layers is not defined.");
+    return Ptr<BackendNode>();
+}
+
 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
 {
     CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
@@ -4824,7 +5292,7 @@ BackendWrapper::~BackendWrapper() {}
 
 Net readNet(const String& _model, const String& _config, const String& _framework)
 {
-    String framework = _framework.toLowerCase();
+    String framework = toLowerCase(_framework);
     String model = _model;
     String config = _config;
     const std::string modelExt = model.substr(model.rfind('.') + 1);
@@ -4873,7 +5341,7 @@ Net readNet(const String& _model, const String& _config, const String& _framewor
 Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
             const std::vector<uchar>& bufferConfig)
 {
-    String framework = _framework.toLowerCase();
+    String framework = toLowerCase(_framework);
     if (framework == "caffe")
         return readNetFromCaffe(bufferConfig, bufferModel);
     else if (framework == "tensorflow")
@@ -4908,5 +5376,5 @@ Net readNetFromModelOptimizer(
     );
 }
 
-CV__DNN_EXPERIMENTAL_NS_END
+CV__DNN_INLINE_NS_END
 }} // namespace