Merge remote-tracking branch 'upstream/3.4' into merge-3.4
author    Alexander Alekhin <alexander.a.alekhin@gmail.com>
Fri, 6 Mar 2020 20:00:55 +0000 (20:00 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
Fri, 6 Mar 2020 20:41:30 +0000 (20:41 +0000)
18 files changed:
modules/core/include/opencv2/core/cvstd.inl.hpp
modules/core/src/norm.cpp
modules/core/src/ocl.cpp
modules/dnn/src/layers/blank_layer.cpp
modules/dnn/src/layers/const_layer.cpp
modules/dnn/src/layers/flatten_layer.cpp
modules/dnn/src/layers/normalize_bbox_layer.cpp
modules/dnn/src/layers/permute_layer.cpp
modules/dnn/src/layers/pooling_layer.cpp
modules/dnn/src/layers/prior_box_layer.cpp
modules/dnn/src/layers/reorg_layer.cpp
modules/dnn/src/layers/reshape_layer.cpp
modules/dnn/src/layers/resize_layer.cpp
modules/dnn/src/layers/scale_layer.cpp
modules/dnn/src/layers/slice_layer.cpp
modules/dnn/src/onnx/onnx_importer.cpp
modules/dnn/test/test_onnx_importer.cpp
modules/imgcodecs/src/grfmt_jpeg.cpp

@@@ -710,67 -710,58 +710,78 @@@ double cv::norm( InputArray _src, int n
      result;
      result.d = 0;
      NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total;
-     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
-             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
-     int isum = 0;
-     int *ibuf = &result.i;
-     AutoBuffer<float> fltbuf_;
-     float* fltbuf = 0;
-     size_t esz = 0;
-     if( blockSum )
-     {
-         esz = src.elemSize();
+     CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
  
-         if( depth == CV_16F )
-         {
-             blockSize = std::min(blockSize, 1024);
-             fltbuf_.allocate(blockSize);
-             fltbuf = fltbuf_.data();
-         }
-         else
+     if ((normType == NORM_L1 && depth <= CV_16S) ||
+         ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
+     {
+         // special case to handle "integer" overflow in accumulator
+         const size_t esz = src.elemSize();
+         const int total = (int)it.size;
+         const int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
+         const int blockSize = std::min(total, intSumBlockSize);
+         int isum = 0;
+         int count = 0;
+         for (size_t i = 0; i < it.nplanes; i++, ++it)
          {
-             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-             blockSize = std::min(blockSize, intSumBlockSize);
-             ibuf = &isum;
+             for (int j = 0; j < total; j += blockSize)
+             {
+                 int bsz = std::min(total - j, blockSize);
+                 func(ptrs[0], ptrs[1], (uchar*)&isum, bsz, cn);
+                 count += bsz;
+                 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
+                 {
+                     result.d += isum;
+                     isum = 0;
+                     count = 0;
+                 }
+                 ptrs[0] += bsz*esz;
+                 if (ptrs[1])
+                     ptrs[1] += bsz;
+             }
          }
      }
-     for( size_t i = 0; i < it.nplanes; i++, ++it )
++    else if (depth == CV_16F)
 +    {
-         for( j = 0; j < total; j += blockSize )
++        const size_t esz = src.elemSize();
++        const int total = (int)it.size;
++        const int blockSize = std::min(total, divUp(1024, cn));
++        AutoBuffer<float, 1024> fltbuf(blockSize);
++        float* data0 = fltbuf.data();
++        for (size_t i = 0; i < it.nplanes; i++, ++it)
 +        {
-             int bsz = std::min(total - j, blockSize);
-             const uchar* data = ptrs[0];
-             if( depth == CV_16F )
-             {
-                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
-                 data = (const uchar*)fltbuf;
-             }
-             func( data, ptrs[1], (uchar*)ibuf, bsz, cn );
-             if( blockSum && depth != CV_16F )
++            for (int j = 0; j < total; j += blockSize)
 +            {
-                 result.d += isum;
-                 isum = 0;
++                int bsz = std::min(total - j, blockSize);
++                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
++                func((uchar*)data0, ptrs[1], (uchar*)&result.d, bsz, cn);
++                ptrs[0] += bsz*esz;
++                if (ptrs[1])
++                    ptrs[1] += bsz;
 +            }
-             ptrs[0] += bsz*esz;
-             if( ptrs[1] )
-                 ptrs[1] += bsz;
++        }
++    }
+     else
+     {
+         // generic implementation
+         for (size_t i = 0; i < it.nplanes; i++, ++it)
+         {
+             func(ptrs[0], ptrs[1], (uchar*)&result, (int)it.size, cn);
          }
      }
  
      if( normType == NORM_INF )
      {
--        if( depth == CV_64F )
-             ;
-         else if( depth == CV_32F )
-             result.d = result.f;
++        if(depth == CV_64F || depth == CV_16F)
+             return result.d;
 -        else if( depth == CV_32F )
++        else if (depth == CV_32F)
+             return result.f;
          else
-             result.d = result.i;
+             return result.i;
      }
      else if( normType == NORM_L2 )
-         result.d = std::sqrt(result.d);
+         return std::sqrt(result.d);
  
      return result.d;
  }
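
The hunk above keeps the L1/L2 partial sums of small integer types in an int and flushes them into the double result before the int can overflow. A minimal standalone sketch of that blocking idea follows; the helper name, block size, and data here are illustrative only and are not OpenCV code.

#include <algorithm>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative sketch (not OpenCV code): compute an L1 sum of 8-bit values,
// flushing the int partial sum into a double every blockSize elements so the
// int accumulator can never overflow.
static double l1_norm_blocked(const std::vector<uint8_t>& data)
{
    const int blockSize = 1 << 23;      // 8388608 * 255 < INT_MAX, so isum stays safe
    double result = 0.0;
    int isum = 0;
    int count = 0;
    for (size_t i = 0; i < data.size(); i++)
    {
        isum += data[i];
        if (++count == blockSize)
        {
            result += isum;             // flush before the int can overflow
            isum = 0;
            count = 0;
        }
    }
    return result + isum;               // flush the remaining partial sum
}

int main()
{
    std::vector<uint8_t> big(30u * 1000 * 1000, 255);
    // A naive int sum would need 30e6 * 255 = 7.65e9, far above INT_MAX (~2.147e9).
    std::printf("%.0f\n", l1_norm_blocked(big));
    return 0;
}
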
@@@ -1186,70 -1177,59 +1197,82 @@@ double cv::norm( InputArray _src1, Inpu
      result;
      result.d = 0;
      NAryMatIterator it(arrays, ptrs);
-     int j, total = (int)it.size, blockSize = total;
-     bool blockSum = depth == CV_16F || (normType == NORM_L1 && depth <= CV_16S) ||
-             ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S);
-     unsigned isum = 0;
-     unsigned *ibuf = &result.u;
-     AutoBuffer<float> fltbuf_;
-     float* fltbuf = 0;
-     size_t esz = 0;
-     if( blockSum )
-     {
-         esz = src1.elemSize();
+     CV_CheckLT((size_t)it.size, (size_t)INT_MAX, "");
  
-         if( depth == CV_16F )
-         {
-             blockSize = std::min(blockSize, 1024);
-             fltbuf_.allocate(blockSize*2);
-             fltbuf = fltbuf_.data();
-         }
-         else
+     if ((normType == NORM_L1 && depth <= CV_16S) ||
+         ((normType == NORM_L2 || normType == NORM_L2SQR) && depth <= CV_8S))
+     {
+         // special case to handle "integer" overflow in accumulator
+         const size_t esz = src1.elemSize();
+         const int total = (int)it.size;
+         const int intSumBlockSize = normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15);
+         const int blockSize = std::min(total, intSumBlockSize);
+         int isum = 0;
+         int count = 0;
+         for (size_t i = 0; i < it.nplanes; i++, ++it)
          {
-             int intSumBlockSize = (normType == NORM_L1 && depth <= CV_8S ? (1 << 23) : (1 << 15))/cn;
-             blockSize = std::min(blockSize, intSumBlockSize);
-             ibuf = &isum;
+             for (int j = 0; j < total; j += blockSize)
+             {
+                 int bsz = std::min(total - j, blockSize);
+                 func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&isum, bsz, cn);
+                 count += bsz;
+                 if (count + blockSize >= intSumBlockSize || (i+1 >= it.nplanes && j+bsz >= total))
+                 {
+                     result.d += isum;
+                     isum = 0;
+                     count = 0;
+                 }
+                 ptrs[0] += bsz*esz;
+                 ptrs[1] += bsz*esz;
+                 if (ptrs[2])
+                     ptrs[2] += bsz;
+             }
          }
      }
-     for( size_t i = 0; i < it.nplanes; i++, ++it )
++    else if (depth == CV_16F)
 +    {
-         for( j = 0; j < total; j += blockSize )
++        const size_t esz = src1.elemSize();
++        const int total = (int)it.size;
++        const int blockSize = std::min(total, divUp(512, cn));
++        AutoBuffer<float, 1024> fltbuf(blockSize * 2);
++        float* data0 = fltbuf.data();
++        float* data1 = fltbuf.data() + blockSize * cn;
++        for (size_t i = 0; i < it.nplanes; i++, ++it)
 +        {
-             int bsz = std::min(total - j, blockSize);
-             const uchar *data0 = ptrs[0], *data1 = ptrs[1];
-             if( depth == CV_16F )
-             {
-                 hal::cvt16f32f((const float16_t*)ptrs[0], fltbuf, bsz);
-                 hal::cvt16f32f((const float16_t*)ptrs[1], fltbuf + bsz, bsz);
-                 data0 = (const uchar*)fltbuf;
-                 data1 = (const uchar*)(fltbuf + bsz);
-             }
-             func( data0, data1, ptrs[2], (uchar*)ibuf, bsz, cn );
-             if( blockSum && depth != CV_16F )
++            for (int j = 0; j < total; j += blockSize)
 +            {
-                 result.d += isum;
-                 isum = 0;
++                int bsz = std::min(total - j, blockSize);
++                hal::cvt16f32f((const float16_t*)ptrs[0], data0, bsz * cn);
++                hal::cvt16f32f((const float16_t*)ptrs[1], data1, bsz * cn);
++                func((uchar*)data0, (uchar*)data1, ptrs[2], (uchar*)&result.d, bsz, cn);
++                ptrs[0] += bsz*esz;
++                ptrs[1] += bsz*esz;
++                if (ptrs[2])
++                    ptrs[2] += bsz;
 +            }
-             ptrs[0] += bsz*esz;
-             ptrs[1] += bsz*esz;
-             if( ptrs[2] )
-                 ptrs[2] += bsz;
++        }
++    }
+     else
+     {
+         // generic implementation
+         for (size_t i = 0; i < it.nplanes; i++, ++it)
+         {
+             func(ptrs[0], ptrs[1], ptrs[2], (uchar*)&result, (int)it.size, cn);
          }
      }
  
      if( normType == NORM_INF )
      {
--        if( depth == CV_64F )
-             ;
-         else if( depth == CV_32F )
-             result.d = result.f;
++        if (depth == CV_64F || depth == CV_16F)
+             return result.d;
 -        else if( depth == CV_32F )
++        else if (depth == CV_32F)
+             return result.f;
          else
-             result.d = result.u;
+             return result.u;
      }
      else if( normType == NORM_L2 )
-         result.d = std::sqrt(result.d);
+         return std::sqrt(result.d);
  
      return result.d;
  }
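
The two-array overload above applies the same blocked accumulation. A small usage sketch, assuming an OpenCV build that includes this change, shows the magnitudes involved: the exact L1 values below exceed INT_MAX, so an unblocked int accumulator would overflow. The matrix sizes are arbitrary, chosen only to push the sum past 2^31.

#include <opencv2/core.hpp>
#include <cstdio>

int main()
{
    cv::Mat a(6000, 6000, CV_8UC1, cv::Scalar(255));
    cv::Mat b(6000, 6000, CV_8UC1, cv::Scalar(0));

    // 6000 * 6000 * 255 = 9.18e9, well above INT_MAX (~2.147e9).
    double n1  = cv::norm(a, cv::NORM_L1);      // single-array overload
    double nd1 = cv::norm(a, b, cv::NORM_L1);   // two-array (difference) overload

    std::printf("L1(a) = %.0f, L1(a - b) = %.0f\n", n1, nd1);
    return 0;
}
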
Simple merge
@@@ -115,18 -108,6 +115,7 @@@ public
                  inputs[i].copyTo(outputs[i]);
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
      {
          return Ptr<BackendNode>(new InfEngineNgraphNode(blank));
      }
  #endif  // HAVE_DNN_NGRAPH
++
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
++    }
++#endif
++
  };
  
  Ptr<Layer> BlankLayer::create(const LayerParams& params)
@@@ -75,6 -68,6 +75,7 @@@ public
          blobs[0].copyTo(outputs[0]);
      }
  
++
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
      {
                                                             blobs[0].data);
          return Ptr<BackendNode>(new InfEngineNgraphNode(node));
      }
- #endif  // HAVE_DNN_IE_NN_BUILDER_2019
 -#endif  // HAVE_NGRAPH
++#endif  // HAVE_DNN_NGRAPH
++
 +
 +#ifdef HAVE_CUDA
 +    Ptr<BackendNode> initCUDA(
 +        void *context_,
 +        const std::vector<Ptr<BackendWrapper>>& inputs,
 +        const std::vector<Ptr<BackendWrapper>>& outputs
 +    ) override
 +    {
 +        auto context = reinterpret_cast<csl::CSLContext*>(context_);
 +
 +        CV_Assert(blobs.size() == 1);
 +        return make_cuda_node<cuda4dnn::ConstOp>(preferableTarget, std::move(context->stream), blobs[0]);
 +    }
 +#endif
  
  };
  
@@@ -171,18 -164,6 +171,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
  virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                      const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
++    }
++#endif
++
++
      int _startAxis;
      int _endAxis;
  };
@@@ -268,33 -261,6 +268,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         if(pnorm != 1 && pnorm != 2)
-             CV_Error(Error::StsNotImplemented, "Unsupported normalization mode");
-         auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
-         auto input_shape = input_wrapper->getShape();
-         NormalizeConfiguration<float> config;
-         config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
-         config.axis_start = clamp(startAxis, input_shape.size());
-         config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */
-         config.norm = pnorm;
-         config.eps = epsilon;
-         const auto& weightsMat = blobs.empty() ? Mat() : blobs[0];
-         return make_cuda_node<cuda4dnn::NormalizeOp>(preferableTarget, std::move(context->stream), weightsMat, config);
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                          const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++
++        if(pnorm != 1 && pnorm != 2)
++            CV_Error(Error::StsNotImplemented, "Unsupported normalization mode");
++
++        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
++        auto input_shape = input_wrapper->getShape();
++
++        NormalizeConfiguration<float> config;
++        config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
++        config.axis_start = clamp(startAxis, input_shape.size());
++        config.axis_end = clamp(endAxis, input_shape.size()) + 1; /* +1 because NormalizeOp follows [start, end) convention */
++        config.norm = pnorm;
++        config.eps = epsilon;
++
++        const auto& weightsMat = blobs.empty() ? Mat() : blobs[0];
++        return make_cuda_node<cuda4dnn::NormalizeOp>(preferableTarget, std::move(context->stream), weightsMat, config);
++    }
++#endif
++
++
  private:
      int startAxis, endAxis;
  };
@@@ -381,28 -371,6 +381,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         return make_cuda_node<cuda4dnn::PermuteOp>(preferableTarget, std::move(context->stream), _order);
-     }
- #endif
-     virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
-     {
- #ifdef HAVE_VULKAN
-         CV_Assert(!_order.empty());
-         std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPermute(_order));
-         return Ptr<BackendNode>(new VkComBackendNode(input, op));
- #endif // HAVE_VULKAN
-         return Ptr<BackendNode>();
-     }
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                          const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++        return make_cuda_node<cuda4dnn::PermuteOp>(preferableTarget, std::move(context->stream), _order);
++    }
++#endif
++
++
++#ifdef HAVE_VULKAN
++    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
++    {
++        CV_Assert(!_order.empty());
++        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPermute(_order));
++        return Ptr<BackendNode>(new VkComBackendNode(input, op));
++    }
++#endif // HAVE_VULKAN
++
++
      size_t _count;
      std::vector<size_t> _order;
  
@@@ -184,12 -174,8 +184,12 @@@ public
  
      virtual bool supportBackend(int backendId) CV_OVERRIDE
      {
 +        if (backendId == DNN_BACKEND_CUDA)
 +        {
 +            return type == MAX || type == AVE || type == ROI;
 +        }
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
-         else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
+         if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
          {
              if (computeMaxIdx)
                  return false;
          {
              return !computeMaxIdx && type != STOCHASTIC;
          }
-         else if (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_VKCOM)
 -        else if (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE)
++        if (backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE || backendId == DNN_BACKEND_VKCOM)
          {
              if (kernel_size.size() == 3)
                  return (backendId == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU);
          }
      }
  
- #ifdef HAVE_VULKAN
 +#ifdef HAVE_CUDA
 +    Ptr<BackendNode> initCUDA(
 +        void *context_,
 +        const std::vector<Ptr<BackendWrapper>>& inputs,
 +        const std::vector<Ptr<BackendWrapper>>& outputs
 +    ) override
 +    {
 +        auto context = reinterpret_cast<csl::CSLContext*>(context_);
 +        if (type == ROI)
 +            return make_cuda_node<cuda4dnn::ROIPoolingOp>(preferableTarget, std::move(context->stream), spatialScale);
 +
 +        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
 +        auto input_shape = input_wrapper->getShape();
 +
 +        /* storing max indices is a special case and we deal with it separately */
 +        if (computeMaxIdx) {
 +            CV_Assert(type == MAX);
 +
 +            cuda4dnn::MaxPoolingConfiguration config;
 +            config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
 +            config.strides.assign(std::begin(strides), std::end(strides));
 +
 +            if (padMode.empty())
 +            {
 +                config.padMode = MaxPoolingConfiguration::PaddingMode::MANUAL;
 +                config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
 +            }
 +            else if (padMode == "VALID")
 +            {
 +                config.padMode = MaxPoolingConfiguration::PaddingMode::VALID;
 +            }
 +            else if (padMode == "SAME")
 +            {
 +                config.padMode = MaxPoolingConfiguration::PaddingMode::SAME;
 +            }
 +            else
 +            {
 +                CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
 +            }
 +
 +            config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
 +
 +            return make_cuda_node<cuda4dnn::MaxPoolingOp>(preferableTarget, std::move(context->stream), config);
 +        }
 +
 +        PoolingConfiguration config;
 +        if (type == MAX)
 +        {
 +            config.poolMode = PoolingConfiguration::PoolingMode::MAX;
 +        }
 +        else if (type == AVE && !avePoolPaddedArea)
 +        {
 +            config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING;
 +        }
 +        else if (type == AVE && avePoolPaddedArea)
 +        {
 +            config.poolMode = PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING;
 +        }
 +        else
 +        {
 +            CV_Error(Error::StsNotImplemented, "Unsupported pooling mode");
 +        }
 +
 +        config.window_size.assign(std::begin(kernel_size), std::end(kernel_size));
 +        config.strides.assign(std::begin(strides), std::end(strides));
 +
 +        if (padMode.empty())
 +        {
 +            config.padMode = PoolingConfiguration::PaddingMode::MANUAL;
 +            config.pads_begin.assign(std::begin(pads_begin), std::end(pads_begin));
 +            config.pads_end.assign(std::begin(pads_end), std::end(pads_end));
 +        }
 +        else if (padMode == "VALID")
 +        {
 +            config.padMode = PoolingConfiguration::PaddingMode::VALID;
 +        }
 +        else if (padMode == "SAME")
 +        {
 +            config.padMode = PoolingConfiguration::PaddingMode::SAME;
 +        }
 +        else
 +        {
 +            CV_Error(Error::StsNotImplemented, padMode + " padding mode not supported by PoolingLayer");
 +        }
 +
 +        if (ceilMode)
 +            config.roundMode = PoolingConfiguration::RoundingMode::CEIL;
 +        else
 +            config.roundMode = PoolingConfiguration::RoundingMode::FLOOR;
 +
 +        config.input_shape.assign(std::begin(input_shape), std::end(input_shape));
 +
 +        return make_cuda_node<cuda4dnn::PoolingOp>(preferableTarget, std::move(context->cudnn_handle), config);
 +    }
 +#endif
 +
++
++#ifdef HAVE_VULKAN
 +    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
 +    {
- #endif
-         return Ptr<BackendNode>();
 +        int padding_mode;
 +        vkcom::PoolType pool_type;
 +        int filter_size[2] = {kernel.height, kernel.width};
 +        int pad_size[2] = {pad.height, pad.width};
 +        int stride_size[2] = {stride.height, stride.width};
 +        pool_type = type == MAX ? vkcom::kPoolTypeMax:
 +                   (type == AVE ? vkcom::kPoolTypeAvg:
 +                            vkcom::kPoolTypeNum);
 +
 +        if (padMode.empty())
 +        {
 +            padding_mode = vkcom::kPaddingModeCaffe;
 +        }
 +        else if (padMode == "VALID")
 +        {
 +            padding_mode = vkcom::kPaddingModeValid;
 +        }
 +        else if (padMode == "SAME")
 +        {
 +            padding_mode = vkcom::kPaddingModeSame;
 +        }
 +        else
 +            CV_Error(Error::StsError, "Unsupported padding mode " + padMode);
 +
 +        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPool(filter_size, pad_size,
 +                                                            stride_size, padding_mode,
 +                                                            pool_type, avePoolPaddedArea));
 +        return Ptr<BackendNode>(new VkComBackendNode(inputs, op));
 +    }
++#endif
++
 +
      virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs) CV_OVERRIDE
      {
          if (type == MAX)
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
  
--
  #ifdef HAVE_DNN_NGRAPH
--virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
--                                    const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
--{
--    CV_Assert_N((inputs.size() == 1 && (type == MAX || type == AVE)) || inputs.size() == 2, nodes.size() == inputs.size());
--    auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
--
--    ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
--    if (!padMode.empty())
--        pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
--
--    auto rounding_type = ceilMode ? ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR;
--    if (type == AVE) {
--        auto exclude_pad = !avePoolPaddedArea;
--        auto ave_pool = std::make_shared<ngraph::op::v1::AvgPool>(ieInpNode, ngraph::Strides(strides),
--                        ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
--                        exclude_pad, rounding_type, pad_type);
--        return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
--    }
--    else if (type == MAX) {
--        auto max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
--                        ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
--                        rounding_type, pad_type);
--        return Ptr<BackendNode>(new InfEngineNgraphNode(max_pool));
--    }
--    else if (type == ROI) {
--        auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
--        auto roi = std::make_shared<ngraph::op::ROIPooling>(ieInpNode, coords,
--                   ngraph::Shape{(size_t)pooledSize.height, (size_t)pooledSize.width}, spatialScale, "max");
--        return Ptr<BackendNode>(new InfEngineNgraphNode(roi));
--    }
--    else if (type == PSROI) {
--        auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
--        auto psroi = std::make_shared<ngraph::op::PSROIPooling>(ieInpNode, coords,
--                     (size_t)psRoiOutChannels, (size_t)pooledSize.width, spatialScale, 1, 1, "average");
--        return Ptr<BackendNode>(new InfEngineNgraphNode(psroi));
++    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
++                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
++    {
++        CV_Assert_N((inputs.size() == 1 && (type == MAX || type == AVE)) || inputs.size() == 2, nodes.size() == inputs.size());
++        auto& ieInpNode = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
++
++        ngraph::op::PadType pad_type = ngraph::op::PadType::EXPLICIT;
++        if (!padMode.empty())
++            pad_type = padMode == "VALID" ? ngraph::op::PadType::VALID : ngraph::op::PadType::SAME_UPPER;
++
++        auto rounding_type = ceilMode ? ngraph::op::RoundingType::CEIL : ngraph::op::RoundingType::FLOOR;
++        if (type == AVE) {
++            auto exclude_pad = !avePoolPaddedArea;
++            auto ave_pool = std::make_shared<ngraph::op::v1::AvgPool>(ieInpNode, ngraph::Strides(strides),
++                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
++                            exclude_pad, rounding_type, pad_type);
++            return Ptr<BackendNode>(new InfEngineNgraphNode(ave_pool));
++        }
++        else if (type == MAX) {
++            auto max_pool = std::make_shared<ngraph::op::v1::MaxPool>(ieInpNode, ngraph::Strides(strides),
++                            ngraph::Shape(pads_begin), ngraph::Shape(pads_end), ngraph::Shape(kernel_size),
++                            rounding_type, pad_type);
++            return Ptr<BackendNode>(new InfEngineNgraphNode(max_pool));
++        }
++        else if (type == ROI) {
++            auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
++            auto roi = std::make_shared<ngraph::op::ROIPooling>(ieInpNode, coords,
++                       ngraph::Shape{(size_t)pooledSize.height, (size_t)pooledSize.width}, spatialScale, "max");
++            return Ptr<BackendNode>(new InfEngineNgraphNode(roi));
++        }
++        else if (type == PSROI) {
++            auto& coords = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
++            auto psroi = std::make_shared<ngraph::op::PSROIPooling>(ieInpNode, coords,
++                         (size_t)psRoiOutChannels, (size_t)pooledSize.width, spatialScale, 1, 1, "average");
++            return Ptr<BackendNode>(new InfEngineNgraphNode(psroi));
++        }
++        else
++            CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
      }
--    else
--        CV_Error(Error::StsNotImplemented, "Unsupported pooling type");
--}
  #endif  // HAVE_DNN_NGRAPH
  
  
@@@ -504,57 -494,6 +504,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         auto feature_map_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
-         auto feature_map_shape = feature_map_wrapper->getShape();
-         auto image_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
-         auto image_shape = image_wrapper->getShape();
-         PriorBoxConfiguration config;
-         config.feature_map_width = feature_map_shape.rbegin()[0];
-         config.feature_map_height = feature_map_shape.rbegin()[1];
-         config.image_width = image_shape.rbegin()[0];
-         config.image_height = image_shape.rbegin()[1];
-         config.num_priors = _numPriors;
-         config.box_widths = _boxWidths;
-         config.box_heights = _boxHeights;
-         config.offsets_x = _offsetsX;
-         config.offsets_y = _offsetsY;
-         config.stepX = _stepX;
-         config.stepY = _stepY;
-         config.variance = _variance;
-         config.clip = _clip;
-         config.normalize = _bboxesNormalized;
-         return make_cuda_node<cuda4dnn::PriorBoxOp>(preferableTarget, std::move(context->stream), config);
-     }
- #endif
-     virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
-     {
- #ifdef HAVE_VULKAN
-         std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPriorBox(_stepX, _stepY,
-                                                                 _clip, _numPriors,
-                                                                 _variance, _offsetsX,
-                                                                 _offsetsY, _boxWidths,
-                                                                 _boxHeights));
-         return Ptr<BackendNode>(new VkComBackendNode(input, op));
- #endif // HAVE_VULKAN
-         return Ptr<BackendNode>();
-     }
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      {
  #endif  // HAVE_DNN_NGRAPH
  
  
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++
++        auto feature_map_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
++        auto feature_map_shape = feature_map_wrapper->getShape();
++
++        auto image_wrapper = inputs[1].dynamicCast<CUDABackendWrapper>();
++        auto image_shape = image_wrapper->getShape();
++
++        PriorBoxConfiguration config;
++        config.feature_map_width = feature_map_shape.rbegin()[0];
++        config.feature_map_height = feature_map_shape.rbegin()[1];
++        config.image_width = image_shape.rbegin()[0];
++        config.image_height = image_shape.rbegin()[1];
++
++        config.num_priors = _numPriors;
++        config.box_widths = _boxWidths;
++        config.box_heights = _boxHeights;
++        config.offsets_x = _offsetsX;
++        config.offsets_y = _offsetsY;
++        config.stepX = _stepX;
++        config.stepY = _stepY;
++
++        config.variance = _variance;
++
++        config.clip = _clip;
++        config.normalize = _bboxesNormalized;
++
++        return make_cuda_node<cuda4dnn::PriorBoxOp>(preferableTarget, std::move(context->stream), config);
++    }
++#endif
++
++
++#ifdef HAVE_VULKAN
++    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
++    {
++        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpPriorBox(_stepX, _stepY,
++                                                                _clip, _numPriors,
++                                                                _variance, _offsetsX,
++                                                                _offsetsY, _boxWidths,
++                                                                _boxHeights));
++        return Ptr<BackendNode>(new VkComBackendNode(input, op));
++    }
++#endif // HAVE_VULKAN
++
++
      virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                             const std::vector<MatShape> &outputs) const CV_OVERRIDE
      {
@@@ -193,18 -185,6 +193,7 @@@ public
          permute->forward(inputs, outputs, internals_arr);
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         return make_cuda_node<cuda4dnn::ReorgOp>(preferableTarget, std::move(context->stream), reorgStride);
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> > &inputs,
                                          const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++        return make_cuda_node<cuda4dnn::ReorgOp>(preferableTarget, std::move(context->stream), reorgStride);
++    }
++#endif
++
++
      virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                             const std::vector<MatShape> &outputs) const CV_OVERRIDE
      {
@@@ -267,18 -260,6 +267,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_IE_NN_BUILDER_2019
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                          const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++        return make_cuda_node<cuda4dnn::ReshapeOp>(preferableTarget, std::move(context->stream));
++    }
++#endif
++
++
  private:
      std::vector<MatShape> outShapes;
  };
@@@ -170,27 -161,6 +170,7 @@@ public
              CV_Error(Error::StsNotImplemented, "Unknown interpolation: " + interpolation);
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         cuda4dnn::InterpolationType itype;
-         if (interpolation == "nearest")
-             itype = InterpolationType::NEAREST_NEIGHBOUR;
-         else if (interpolation == "bilinear")
-             itype = InterpolationType::BILINEAR;
-         else
-             CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer.");
-         return make_cuda_node<cuda4dnn::ResizeOp>(preferableTarget, std::move(context->stream), itype, scaleHeight, scaleWidth);
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
      {
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++
++        cuda4dnn::InterpolationType itype;
++        if (interpolation == "nearest")
++            itype = InterpolationType::NEAREST_NEIGHBOUR;
++        else if (interpolation == "bilinear")
++            itype = InterpolationType::BILINEAR;
++        else
++            CV_Error(Error::StsNotImplemented, "Requested interpolation mode is not available in resize layer.");
++
++        return make_cuda_node<cuda4dnn::ResizeOp>(preferableTarget, std::move(context->stream), itype, scaleHeight, scaleWidth);
++    }
++#endif
++
++
  protected:
      int outWidth, outHeight;
      const int zoomFactorWidth, zoomFactorHeight;
@@@ -273,28 -266,6 +273,7 @@@ public
          }
      }
  
- #ifdef HAVE_CUDA
-     Ptr<BackendNode> initCUDA(
-         void *context_,
-         const std::vector<Ptr<BackendWrapper>>& inputs,
-         const std::vector<Ptr<BackendWrapper>>& outputs
-     ) override
-     {
-         auto context = reinterpret_cast<csl::CSLContext*>(context_);
-         std::vector<std::vector<std::size_t>> offsets;
-         for (const auto& ranges : sliceRanges)
-         {
-             std::vector<std::size_t> offsets_i;
-             for (const auto& range : ranges)
-                 offsets_i.push_back(range.start);
-             offsets.push_back(std::move(offsets_i));
-         }
-         return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets));
-     }
- #endif
 +
  #ifdef HAVE_DNN_IE_NN_BUILDER_2019
  #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
      virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
  #endif
  #endif
  
++
  #ifdef HAVE_DNN_NGRAPH
      virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                          const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
      }
  #endif  // HAVE_DNN_NGRAPH
  
++
++#ifdef HAVE_CUDA
++    Ptr<BackendNode> initCUDA(
++        void *context_,
++        const std::vector<Ptr<BackendWrapper>>& inputs,
++        const std::vector<Ptr<BackendWrapper>>& outputs
++    ) override
++    {
++        auto context = reinterpret_cast<csl::CSLContext*>(context_);
++
++        std::vector<std::vector<std::size_t>> offsets;
++        for (const auto& ranges : sliceRanges)
++        {
++            std::vector<std::size_t> offsets_i;
++            for (const auto& range : ranges)
++                offsets_i.push_back(range.start);
++            offsets.push_back(std::move(offsets_i));
++        }
++
++        return make_cuda_node<cuda4dnn::SliceOp>(preferableTarget, std::move(context->stream), std::move(offsets));
++    }
++#endif
++
  };
  
  class CropLayerImpl CV_FINAL : public SliceLayerImpl
Simple merge