Merge remote-tracking branch 'upstream/3.4' into merge-3.4
[platform/upstream/opencv.git] / modules / dnn / src / dnn.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the Intel Corporation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include "op_halide.hpp"
44 #include "op_inf_engine.hpp"
45 #include "ie_ngraph.hpp"
46 #include "op_vkcom.hpp"
47 #include "op_cuda.hpp"
48
49 #ifdef HAVE_CUDA
50 #include "cuda4dnn/init.hpp"
51 #include "cuda4dnn/primitives/eltwise.hpp" // required by fuseLayers
52 #endif
53
54 #include "halide_scheduler.hpp"
55
56 #include <set>
57 #include <algorithm>
58 #include <iostream>
59 #include <sstream>
60 #include <fstream>
61 #include <iterator>
62 #include <numeric>
63 #include <memory>
64 #include <opencv2/dnn/shape_utils.hpp>
65 #include <opencv2/imgproc.hpp>
66
67 #include <opencv2/core/utils/configuration.private.hpp>
68 #include <opencv2/core/utils/logger.hpp>
69
70 namespace cv {
71 namespace dnn {
72 CV__DNN_INLINE_NS_BEGIN
73
74 static size_t DNN_NETWORK_DUMP = utils::getConfigurationParameterSizeT("OPENCV_DNN_NETWORK_DUMP", 0);
75
76 // this option is useful to run valgrind memory errors detection
77 static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
78
79 #ifdef HAVE_OPENCL
80 static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
81 #endif
82
83 static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
84 #ifdef HAVE_INF_ENGINE
85     (size_t)DNN_BACKEND_INFERENCE_ENGINE
86 #else
87     (size_t)DNN_BACKEND_OPENCV
88 #endif
89 );
90
91 // Additional checks (slowdowns execution!)
92 static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
93 static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
94 static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);
95
96 using std::vector;
97 using std::map;
98 using std::make_pair;
99 using std::set;
100 using std::string;
101
102 //==================================================================================================
103
104 class BackendRegistry
105 {
106 public:
107     typedef std::vector< std::pair<Backend, Target> > BackendsList;
108     const BackendsList & getBackends() const { return backends; }
109     static BackendRegistry & getRegistry()
110     {
111         static BackendRegistry impl;
112         return impl;
113     }
114
115 #ifdef HAVE_INF_ENGINE
116     static inline bool checkIETarget(Target target)
117     {
118 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R3)
119         // Lightweight detection
120         const std::vector<std::string> devices = getCore("").GetAvailableDevices();
121         for (std::vector<std::string>::const_iterator i = devices.begin(); i != devices.end(); ++i)
122         {
123             if (std::string::npos != i->find("MYRIAD") && target == DNN_TARGET_MYRIAD)
124                 return true;
125             else if (std::string::npos != i->find("FPGA") && target == DNN_TARGET_FPGA)
126                 return true;
127             else if (std::string::npos != i->find("CPU") && target == DNN_TARGET_CPU)
128                 return true;
129             else if (std::string::npos != i->find("GPU") && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16))
130                 return true;
131         }
132         return false;
133 #else
134         cv::dnn::Net net;
135         cv::dnn::LayerParams lp;
136         lp.set("kernel_size", 1);
137         lp.set("num_output", 1);
138         lp.set("bias_term", false);
139         lp.type = "Convolution";
140         lp.name = "testLayer";
141         lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
142         net.addLayerToPrev(lp.name, lp.type, lp);
143         net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
144         net.setPreferableTarget(target);
145         static int inpDims[] = {1, 2, 3, 4};
146         net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
147         try
148         {
149             net.forward();
150         }
151         catch(const std::exception& e)
152         {
153             CV_LOG_INFO(NULL, "checkIETarget(" << (int)target << ") has failed with message: " << e.what());
154             return false;
155         }
156         return true;
157 #endif
158     }
159 #endif
160
161 private:
162     BackendRegistry()
163     {
164 #ifdef HAVE_HALIDE
165         backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
166 #  ifdef HAVE_OPENCL
167         if (cv::ocl::useOpenCL())
168             backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
169 #  endif
170 #endif // HAVE_HALIDE
171
172 #ifdef HAVE_INF_ENGINE
173         if (checkIETarget(DNN_TARGET_CPU)) {
174 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
175             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, DNN_TARGET_CPU));
176 #endif
177 #ifdef HAVE_DNN_NGRAPH
178             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_CPU));
179 #endif
180         }
181         if (checkIETarget(DNN_TARGET_MYRIAD)) {
182 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
183             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, DNN_TARGET_MYRIAD));
184 #endif
185 #ifdef HAVE_DNN_NGRAPH
186             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_MYRIAD));
187 #endif
188         }
189 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
190         if (checkIETarget(DNN_TARGET_FPGA))
191             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, DNN_TARGET_FPGA));
192 #endif
193 #ifdef HAVE_OPENCL
194         if (cv::ocl::useOpenCL() && ocl::Device::getDefault().isIntel())
195         {
196             if (checkIETarget(DNN_TARGET_OPENCL)) {
197 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
198                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, DNN_TARGET_OPENCL));
199 #endif
200 #ifdef HAVE_DNN_NGRAPH
201                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_OPENCL));
202 #endif
203             }
204             if (checkIETarget(DNN_TARGET_OPENCL_FP16)) {
205 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
206                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, DNN_TARGET_OPENCL_FP16));
207 #endif
208 #ifdef HAVE_DNN_NGRAPH
209                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, DNN_TARGET_OPENCL_FP16));
210 #endif
211             }
212         }
213 #endif
214 #endif // HAVE_INF_ENGINE
215
216 #ifdef HAVE_OPENCL
217         if (cv::ocl::useOpenCL())
218         {
219             backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
220             backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
221         }
222 #endif
223
224         backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
225
226 #ifdef HAVE_VULKAN
227         if (haveVulkan())
228             backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
229 #endif
230
231 #ifdef HAVE_CUDA
232         if (haveCUDA() && cuda4dnn::isDeviceCompatible())
233         {
234             backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
235             if (cuda4dnn::doesDeviceSupportFP16())
236                 backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
237         }
238 #endif
239     }
240
241     BackendsList backends;
242 };
243
244
245 std::vector< std::pair<Backend, Target> > getAvailableBackends()
246 {
247     return BackendRegistry::getRegistry().getBackends();
248 }
249
250 std::vector<Target> getAvailableTargets(Backend be)
251 {
252     if (be == DNN_BACKEND_DEFAULT)
253         be = (Backend)PARAM_DNN_BACKEND_DEFAULT;
254 #ifdef HAVE_INF_ENGINE
255     if (be == DNN_BACKEND_INFERENCE_ENGINE)
256         be = getInferenceEngineBackendTypeParam();
257 #endif
258
259     std::vector<Target> result;
260     const BackendRegistry::BackendsList all_backends = getAvailableBackends();
261     for(BackendRegistry::BackendsList::const_iterator i = all_backends.begin(); i != all_backends.end(); ++i )
262     {
263         if (i->first == be)
264             result.push_back(i->second);
265     }
266     return result;
267 }
268
269 //==================================================================================================
270
271 namespace
272 {
273     typedef std::vector<MatShape> ShapesVec;
274
275     struct LayerShapes
276     {
277         ShapesVec in, out, internal;
278         // No guarantees that layer which support in-place computations
279         // will be computed in-place (input.data_ptr == output.data_ptr).
280         // If layer said that it could work in-place and layers after it
281         // no longer use input blob, we'll set output = input.
282         bool supportInPlace;
283         LayerShapes() {supportInPlace = false;}
284     };
285 }
286
287 Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
288                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
289 {
290     CV_TRACE_FUNCTION();
291     Mat blob;
292     blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
293     return blob;
294 }
295
296 void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
297                    const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
298 {
299     CV_TRACE_FUNCTION();
300     std::vector<Mat> images(1, image.getMat());
301     blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
302 }
303
304 Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
305                    const Scalar& mean, bool swapRB, bool crop, int ddepth)
306 {
307     CV_TRACE_FUNCTION();
308     Mat blob;
309     blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
310     return blob;
311 }
312
313 void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
314                     Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
315 {
316     CV_TRACE_FUNCTION();
317     CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
318     if (ddepth == CV_8U)
319     {
320         CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
321         CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
322     }
323
324     std::vector<Mat> images;
325     images_.getMatVector(images);
326     CV_Assert(!images.empty());
327     for (size_t i = 0; i < images.size(); i++)
328     {
329         Size imgSize = images[i].size();
330         if (size == Size())
331             size = imgSize;
332         if (size != imgSize)
333         {
334             if(crop)
335             {
336               float resizeFactor = std::max(size.width / (float)imgSize.width,
337                                             size.height / (float)imgSize.height);
338               resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
339               Rect crop(Point(0.5 * (images[i].cols - size.width),
340                               0.5 * (images[i].rows - size.height)),
341                         size);
342               images[i] = images[i](crop);
343             }
344             else
345               resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
346         }
347         if(images[i].depth() == CV_8U && ddepth == CV_32F)
348             images[i].convertTo(images[i], CV_32F);
349         Scalar mean = mean_;
350         if (swapRB)
351             std::swap(mean[0], mean[2]);
352
353         images[i] -= mean;
354         images[i] *= scalefactor;
355     }
356
357     size_t nimages = images.size();
358     Mat image0 = images[0];
359     int nch = image0.channels();
360     CV_Assert(image0.dims == 2);
361     if (nch == 3 || nch == 4)
362     {
363         int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
364         blob_.create(4, sz, ddepth);
365         Mat blob = blob_.getMat();
366         Mat ch[4];
367
368         for(size_t i = 0; i < nimages; i++ )
369         {
370             const Mat& image = images[i];
371             CV_Assert(image.depth() == blob_.depth());
372             nch = image.channels();
373             CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
374             CV_Assert(image.size() == image0.size());
375
376             for( int j = 0; j < nch; j++ )
377                 ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
378             if(swapRB)
379                 std::swap(ch[0], ch[2]);
380             split(image, ch);
381         }
382     }
383     else
384     {
385        CV_Assert(nch == 1);
386        int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
387        blob_.create(4, sz, ddepth);
388        Mat blob = blob_.getMat();
389
390        for(size_t i = 0; i < nimages; i++ )
391        {
392            const Mat& image = images[i];
393            CV_Assert(image.depth() == blob_.depth());
394            nch = image.channels();
395            CV_Assert(image.dims == 2 && (nch == 1));
396            CV_Assert(image.size() == image0.size());
397
398            image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
399        }
400     }
401 }
402
403 void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
404 {
405     CV_TRACE_FUNCTION();
406
407     //A blob is a 4 dimensional matrix in floating point precision
408     //blob_[0] = batchSize = nbOfImages
409     //blob_[1] = nbOfChannels
410     //blob_[2] = height
411     //blob_[3] = width
412     CV_Assert(blob_.depth() == CV_32F);
413     CV_Assert(blob_.dims == 4);
414
415     images_.create(cv::Size(1, blob_.size[0]), blob_.depth());
416
417     std::vector<Mat> vectorOfChannels(blob_.size[1]);
418     for (int n = 0; n <  blob_.size[0]; ++n)
419     {
420         for (int c = 0; c < blob_.size[1]; ++c)
421         {
422             vectorOfChannels[c] = getPlane(blob_, n, c);
423         }
424         cv::merge(vectorOfChannels, images_.getMatRef(n));
425     }
426 }
427
428 #ifdef HAVE_OPENCL
429 class OpenCLBackendWrapper : public BackendWrapper
430 {
431 public:
432     OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
433     {
434         m.copyTo(umat);
435         host = &m;
436         hostDirty = false;
437     }
438
439     OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
440         : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
441     {
442         Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
443         CV_Assert(!base.empty());
444
445         host = &m;
446
447         int shape[] = {1, (int)base->umat.total()};
448         umat = base->umat.reshape(1, 2, &shape[0])
449                          .colRange(0, host->total())
450                          .reshape(1, host->dims, &host->size[0]);
451         hostDirty = false;
452     }
453
454     static Ptr<BackendWrapper> create(Mat& m)
455     {
456         return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
457     }
458
459     static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
460     {
461         return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
462     }
463
464     static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
465     {
466         const int numWrappers = wrappers.size();
467         std::vector<UMat> mats(wrappers.size());
468         for (int i = 0; i < numWrappers; ++i)
469         {
470             Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
471             CV_Assert(!umatWrapper.empty());
472             umatWrapper->copyToDevice();
473             mats[i] = umatWrapper->umat;
474         }
475         return mats;
476     }
477
478     // Replaces all umats in wrappers to specific ones.
479     static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
480                        const std::vector<UMat>& umats)
481     {
482         CV_Assert(wrappers.size() == umats.size());
483         for (int i = 0, n = umats.size(); i < n; ++i)
484         {
485             Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
486             CV_Assert(!umatWrapper.empty());
487             umatWrapper->umat = umats[i];
488         }
489     }
490
491     ~OpenCLBackendWrapper() {}
492
493     // Copies data from device to a host memory.
494     virtual void copyToHost() CV_OVERRIDE
495     {
496         umat.copyTo(*host);
497     }
498
499     virtual void setHostDirty() CV_OVERRIDE
500     {
501         hostDirty = true;
502     };
503
504     void copyToDevice()
505     {
506         if (hostDirty)
507         {
508             host->copyTo(umat);
509             hostDirty = false;
510         }
511     }
512
513 private:
514     UMat umat;
515     Mat* host;
516     bool hostDirty;
517 };
518 #endif
519
520 struct LayerPin
521 {
522     int lid;
523     int oid;
524
525     LayerPin(int layerId = -1, int outputId = -1)
526         : lid(layerId), oid(outputId) {}
527
528     bool valid() const
529     {
530         return (lid >= 0 && oid >= 0);
531     }
532
533     bool equal(const LayerPin &r) const
534     {
535         return (lid == r.lid && oid == r.oid);
536     }
537
538     bool operator<(const LayerPin &r) const
539     {
540         return lid < r.lid || (lid == r.lid && oid < r.oid);
541     }
542
543     bool operator ==(const LayerPin &r) const
544     {
545         return lid == r.lid && oid == r.oid;
546     }
547 };
548
549 struct LayerData
550 {
551     LayerData() : id(-1), skip(false), flag(0) {}
552     LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
553         : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
554     {
555         CV_TRACE_FUNCTION();
556
557         //add logging info
558         params.name = name;
559         params.type = type;
560     }
561
562     int id;
563     String name;
564     String type;
565     LayerParams params;
566
567     std::vector<LayerPin> inputBlobsId;
568     std::set<int> inputLayersId;
569     std::set<int> requiredOutputs;
570     std::vector<LayerPin> consumers;
571     std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
572     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
573     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
574
575 #ifdef HAVE_CUDA
576     /* output ids which must be transferred to the host in the background
577      * after the completion of the forward pass of the layer
578      */
579     std::vector<int> cudaD2HBackgroundTransfers;
580 #endif
581
582     Ptr<Layer> layerInstance;
583     std::vector<Mat> outputBlobs;
584     std::vector<Mat*> inputBlobs;
585     std::vector<Mat> internals;
586     // Computation nodes of implemented backends (except DEFAULT).
587     std::map<int, Ptr<BackendNode> > backendNodes;
588     // Flag for skip layer computation for specific backend.
589     bool skip;
590
591     int flag;
592
593     Ptr<Layer> getLayerInstance()
594     {
595         CV_TRACE_FUNCTION();
596         CV_TRACE_ARG_VALUE(type, "type", type.c_str());
597
598         if (layerInstance)
599             return layerInstance;
600
601         layerInstance = LayerFactory::createLayerInstance(type, params);
602         if (!layerInstance)
603         {
604             CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
605         }
606
607         return layerInstance;
608     }
609 };
610
611 //fake layer containing network input blobs
612 struct DataLayer : public Layer
613 {
614     DataLayer() : Layer()
615     {
616         skip = false;
617     }
618
619     virtual bool supportBackend(int backendId) CV_OVERRIDE
620     {
621         return backendId == DNN_BACKEND_OPENCV ||
622                (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && inputsData.size() == 1);
623     }
624
625     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
626     {
627         CV_TRACE_FUNCTION();
628         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
629
630         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
631                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
632
633         if (outputs_arr.depth() == CV_16S)
634         {
635             forward_fallback(inputs_arr, outputs_arr, internals_arr);
636             return;
637         }
638
639         std::vector<Mat> outputs, internals;
640         outputs_arr.getMatVector(outputs);
641         internals_arr.getMatVector(internals);
642
643         // Supported modes:
644         // | Input type | Output type |
645         // |       fp32 |        fp32 |
646         // |      uint8 |        fp32 |
647         for (int i = 0; i < inputsData.size(); ++i)
648         {
649             double scale = scaleFactors[i];
650             Scalar& mean = means[i];
651             CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
652             CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");
653
654             bool singleMean = true;
655             for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
656             {
657                 singleMean = mean[j] == mean[j - 1];
658             }
659
660             if (singleMean)
661             {
662                 inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
663             }
664             else
665             {
666                 for (int n = 0; n < inputsData[i].size[0]; ++n)
667                     for (int c = 0; c < inputsData[i].size[1]; ++c)
668                     {
669                         Mat inp = getPlane(inputsData[i], n, c);
670                         Mat out = getPlane(outputs[i], n, c);
671                         inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
672                     }
673             }
674         }
675     }
676
677 #ifdef HAVE_OPENCL
678     std::vector<Mat> tmp_expressions;
679     bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
680     {
681         // Supported modes:
682         // | Input type | Output type |
683         // |       fp32 |        fp32 |
684         // |       fp32 |        fp16 |
685         // |      uint8 |        fp32 |
686         std::vector<UMat> outputs;
687         outputs_.getUMatVector(outputs);
688
689         tmp_expressions.clear();
690         for (int i = 0; i < inputsData.size(); ++i)
691         {
692             Mat inputData = inputsData[i];
693
694             double scale = scaleFactors[i];
695             Scalar& mean = means[i];
696
697             CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
698             bool singleMean = true;
699             for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
700             {
701                 singleMean = mean[j] == mean[j - 1];
702             }
703
704             if (outputs_.depth() == CV_16S)
705             {
706                 if (singleMean)
707                 {
708                     tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
709                     convertFp16(tmp_expressions.back(), outputs[i]);
710                 }
711                 else
712                 {
713                     for (int n = 0; n < inputsData[i].size[0]; ++n)
714                         for (int c = 0; c < inputsData[i].size[1]; ++c)
715                         {
716                             Mat inp = getPlane(inputsData[i], n, c);
717
718                             std::vector<cv::Range> plane(4, Range::all());
719                             plane[0] = Range(n, n + 1);
720                             plane[1] = Range(c, c + 1);
721                             UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
722
723                             tmp_expressions.push_back(scale * (inp - mean[c]));
724                             convertFp16(tmp_expressions.back(), out);
725                         }
726                 }
727             }
728             else
729             {
730                 CV_Assert(outputs_.depth() == CV_32F);
731                 if (singleMean)
732                 {
733                     inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
734                 }
735                 else
736                 {
737                     for (int n = 0; n < inputsData[i].size[0]; ++n)
738                         for (int c = 0; c < inputsData[i].size[1]; ++c)
739                         {
740                             Mat inp = getPlane(inputsData[i], n, c);
741
742                             std::vector<cv::Range> plane(4, Range::all());
743                             plane[0] = Range(n, n + 1);
744                             plane[1] = Range(c, c + 1);
745                             UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
746
747                             inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
748                         }
749                 }
750             }
751         }
752         return true;
753     }
754 #endif
755
756     int outputNameToIndex(const String& tgtName) CV_OVERRIDE
757     {
758         int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
759         return (idx < (int)outNames.size()) ? idx : -1;
760     }
761
762     void setNames(const std::vector<String> &names)
763     {
764         outNames.assign(names.begin(), names.end());
765         shapes.clear(); shapes.resize(outNames.size());
766     }
767
768     void setInputShape(const String& tgtName, const MatShape& shape)
769     {
770         std::vector<String>::const_iterator it = std::find(outNames.begin(), outNames.end(), tgtName);
771         CV_Check(tgtName, it != outNames.end(), "Unknown input");
772         int idx = (int)(it - outNames.begin());
773
774         CV_Assert(idx < (int)shapes.size());
775         CV_Check(tgtName, shapes[idx].empty(), "Input shape redefinition is not allowed");
776         shapes[idx] = shape;
777     }
778
779     bool getMemoryShapes(const std::vector<MatShape> &inputs,
780                          const int requiredOutputs,
781                          std::vector<MatShape> &outputs,
782                          std::vector<MatShape> &internals) const CV_OVERRIDE
783     {
784         CV_Assert(inputs.size() == requiredOutputs);
785         outputs.assign(inputs.begin(), inputs.end());
786         return false;
787     }
788
789     virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
790     {
791         std::vector<Mat> outputs;
792         outputs_arr.getMatVector(outputs);
793
794         CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
795                   inputsData.size() == outputs.size());
796         skip = true;
797         for (int i = 0; skip && i < inputsData.size(); ++i)
798         {
799             if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
800                 skip = false;
801         }
802     }
803
804 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
805     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
806     {
807         CV_CheckEQ(inputsData.size(), (size_t)1, "");
808         CV_CheckEQ(inputsData[0].dims, 4, "");
809         const size_t numChannels = inputsData[0].size[1];
810         CV_Assert(numChannels <= 4);
811
812         // Scale
813         InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
814                                        InferenceEngine::Layout::C);
815         auto weights = InferenceEngine::make_shared_blob<float>(td);
816         weights->allocate();
817
818         float* weight_buf = weights->buffer().as<float*>();
819         std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);
820
821         // Mean subtraction
822         auto biases = InferenceEngine::make_shared_blob<float>(td);
823         biases->allocate();
824         float* bias_buf = biases->buffer().as<float*>();
825
826         for (int i = 0; i < numChannels; ++i)
827         {
828             bias_buf[i] = -means[0][i] * scaleFactors[0];
829         }
830
831         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
832         addConstantData("weights", weights, ieLayer);
833         addConstantData("biases", biases, ieLayer);
834         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
835     }
836 #endif  // HAVE_DNN_IE_NN_BUILDER_2019
837
838     std::vector<String> outNames;
839     std::vector<MatShape> shapes;
840     // Preprocessing parameters for each network's input.
841     std::vector<double> scaleFactors;
842     std::vector<Scalar> means;
843     std::vector<Mat> inputsData;
844     bool skip;
845 };
846
847 struct BlobManager
848 {
849 public:
850     // Increase references counter to layer output.
851     void addReference(const LayerPin& lp)
852     {
853         std::map<LayerPin, int>::iterator it = refCounter.find(lp);
854         if (it == refCounter.end())
855             refCounter[lp] = 1;
856         else
857             it->second += 1;
858     }
859
860     void addReferences(const std::vector<LayerPin>& pins)
861     {
862         for (int i = 0; i < pins.size(); i++)
863         {
864             addReference(pins[i]);
865         }
866     }
867
868     // Returns number of references to allocated memory that used in specific
869     // layer blob.
870     int numReferences(const LayerPin& lp)
871     {
872         std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
873         CV_Assert(mapIt != reuseMap.end());
874         LayerPin memHost = mapIt->second;
875
876         std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
877         CV_Assert(refIt != refCounter.end());
878         return refIt->second;
879     }
880
881     // Reuse data allocated in <host> inside the <user> blob.
882     void reuse(const LayerPin& host, const LayerPin& user)
883     {
884         CV_Assert(reuseMap.find(user) == reuseMap.end());
885         CV_Assert(reuseMap.find(host) != reuseMap.end());
886         LayerPin memHost = reuseMap[host];
887         reuseMap[user] = memHost;
888         if (refCounter.find(memHost) != refCounter.end())
889         {
890             std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
891             if (userRefIt != refCounter.end())
892             {
893                 refCounter[memHost] += userRefIt->second;
894                 refCounter.erase(userRefIt);
895             }
896             else
897                 refCounter[memHost] += 1;
898         }
899     }
900
901     // Decrease references counter to allocated memory inside specific blob.
902     void releaseReference(const LayerPin& lp)
903     {
904         std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
905         CV_Assert(mapIt != reuseMap.end());
906
907         std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
908         CV_Assert(refIt != refCounter.end());
909         CV_Assert(refIt->second > 0);
910         refIt->second -= 1;
911     }
912
913     void releaseReferences(const std::vector<LayerPin>& pins)
914     {
915         for (int i = 0; i < pins.size(); i++)
916         {
917             releaseReference(pins[i]);
918         }
919     }
920
921     void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
922     {
923         if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
924         {
925             Mat bestBlob;
926             LayerPin bestBlobPin;
927
928             std::map<LayerPin, Mat>::iterator hostIt;
929             std::map<LayerPin, int>::iterator refIt;
930
931             const int targetTotal = total(shape);
932             int bestBlobTotal = INT_MAX;
933
934             for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
935             {
936                 refIt = refCounter.find(hostIt->first);
937                 // Use only blobs that had references before because if not,
938                 // it might be used as output.
939                 if (refIt != refCounter.end() && refIt->second == 0)
940                 {
941                     Mat& unusedBlob = hostIt->second;
942                     if (unusedBlob.total() >= targetTotal &&
943                         unusedBlob.total() < bestBlobTotal)
944                     {
945                         bestBlobPin = hostIt->first;
946                         bestBlob = unusedBlob;
947                         bestBlobTotal = unusedBlob.total();
948                     }
949                 }
950             }
951             if (!bestBlob.empty())
952             {
953                 reuse(bestBlobPin, lp);
954                 dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
955                 return;
956             }
957         }
958
959         {
960             // if dst already has been allocated with total(shape) elements,
961             // it won't be recreated and pointer of dst.data remains the same.
962             dst.create(shape, use_half ? CV_16S : CV_32F);
963             addHost(lp, dst);
964         }
965     }
966
967     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
968                                std::vector<LayerPin>& pinsForInternalBlobs,
969                                bool use_half = false)
970     {
971         CV_TRACE_FUNCTION();
972
973         pinsForInternalBlobs.clear();
974
975         std::vector<Mat>& outputBlobs = ld.outputBlobs,
976                 &internalBlobs = ld.internals;
977
978         const ShapesVec& outShapes = layerShapes.out,
979                 internalShapes = layerShapes.internal;
980
981         outputBlobs.resize(std::max((size_t)1, outShapes.size())); //layer produce at least one output blob
982         internalBlobs.resize(internalShapes.size());
983
984         CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
985
986         // Check that layer could work in-place.
987         bool inPlace = false;
988         if (layerShapes.supportInPlace)
989         {
990             if (ld.inputBlobs.size() == 1)
991             {
992                 // Get number of references to the input memory.
993                 int numRef = numReferences(ld.inputBlobsId[0]);
994                 // If current layer is one and only customer of this blob.
995                 inPlace = numRef == 1;
996             }
997         }
998
999         ShapesVec shapes(outShapes);
1000         shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
1001         std::vector<Mat*> blobs;
1002         for(int i = 0; i < outputBlobs.size(); i++)
1003         {
1004             blobs.push_back(&outputBlobs[i]);
1005         }
1006
1007         for(int i = 0; i < internalBlobs.size(); i++)
1008         {
1009             blobs.push_back(&internalBlobs[i]);
1010             if (total(internalShapes[i]))
1011             {
1012                 pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
1013             }
1014         }
1015
1016         addReferences(pinsForInternalBlobs);
1017
1018         std::map<int, std::vector<int> > idxSizes;
1019         for(int i = 0; i < shapes.size(); i++)
1020         {
1021             idxSizes[total(shapes[i])].push_back(i);
1022         }
1023
1024         std::map<int, std::vector<int> >::reverse_iterator it;
1025         for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
1026         {
1027             for(int j = 0; j < it->second.size(); j++)
1028             {
1029                 int index = it->second[j];
1030                 if (total(shapes[index]))
1031                 {
1032                     LayerPin blobPin(ld.id, index);
1033                     if (index < outShapes.size() && inPlace)
1034                     {
1035                         CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
1036                         ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
1037                         reuse(ld.inputBlobsId[0], blobPin);
1038                     }
1039                     else
1040                         reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
1041                 }
1042             }
1043         }
1044     }
1045
1046     // Clear internal state. Calls before an every reallocation.
1047     void reset()
1048     {
1049         CV_TRACE_FUNCTION();
1050
1051         refCounter.clear();
1052         reuseMap.clear();
1053         memHosts.clear();
1054     }
1055
1056 private:
1057     // Register allocated memory.
1058     void addHost(const LayerPin& lp, const Mat& mat)
1059     {
1060         CV_Assert(memHosts.find(lp) == memHosts.end());
1061         reuseMap[lp] = lp;
1062         memHosts[lp] = mat;
1063     }
1064
1065     std::map<LayerPin, int> refCounter;
1066     // Maps pin to origin blob (for whom memory was allocated firstly).
1067     // For origin blobs key == value.
1068     std::map<LayerPin, LayerPin> reuseMap;
1069     std::map<LayerPin, Mat> memHosts;
1070 };
1071
1072 static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
1073 {
1074     if (backendId == DNN_BACKEND_OPENCV)
1075     {
1076         if (targetId == DNN_TARGET_CPU)
1077             return Ptr<BackendWrapper>();
1078 #ifdef HAVE_OPENCL
1079         else if (IS_DNN_OPENCL_TARGET(targetId))
1080             return OpenCLBackendWrapper::create(m);
1081 #endif
1082         else
1083             CV_Error(Error::StsNotImplemented, "Unknown/unsupported target identifier");
1084     }
1085     else if (backendId == DNN_BACKEND_HALIDE)
1086     {
1087         CV_Assert(haveHalide());
1088 #ifdef HAVE_HALIDE
1089         return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
1090 #endif  // HAVE_HALIDE
1091     }
1092     else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
1093     {
1094 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
1095         return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
1096 #else
1097         CV_Error(Error::StsNotImplemented, "This OpenCV version is built without Inference Engine NN Builder API support");
1098 #endif
1099     }
1100     else if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
1101     {
1102 #ifdef HAVE_DNN_NGRAPH
1103         return Ptr<BackendWrapper>(new NgraphBackendWrapper(targetId, m));
1104 #else
1105         CV_Error(Error::StsNotImplemented, "This OpenCV version is built without support of Inference Engine + nGraph");
1106 #endif
1107     }
1108     else if (backendId == DNN_BACKEND_VKCOM)
1109     {
1110         CV_Assert(haveVulkan());
1111 #ifdef HAVE_VULKAN
1112         return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
1113 #endif  // HAVE_VULKAN
1114     }
1115     else if (backendId == DNN_BACKEND_CUDA)
1116     {
1117         CV_Assert(haveCUDA());
1118
1119 #ifdef HAVE_CUDA
1120         switch (targetId)
1121         {
1122         case DNN_TARGET_CUDA:
1123             return CUDABackendWrapperFP32::create(m);
1124         case DNN_TARGET_CUDA_FP16:
1125             return CUDABackendWrapperFP16::create(m);
1126         default:
1127             CV_Assert(IS_DNN_CUDA_TARGET(targetId));
1128         }
1129 #endif
1130     }
1131     else
1132         CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1133     return Ptr<BackendWrapper>();  // TODO Error?
1134 }
1135
1136 static int g_networkId = 0;
1137
1138 detail::NetImplBase::NetImplBase()
1139     : networkId(CV_XADD(&g_networkId, 1))
1140     , networkDumpCounter(0)
1141     , dumpLevel(DNN_NETWORK_DUMP)
1142 {
1143     // nothing
1144 }
1145
1146 std::string detail::NetImplBase::getDumpFileNameBase()
1147 {
1148     std::string dumpFileNameBase = cv::format("ocv_dnn_net_%05d_%02d", networkId, networkDumpCounter++);
1149     return dumpFileNameBase;
1150 }
1151
1152 struct Net::Impl : public detail::NetImplBase
1153 {
1154     typedef std::map<int, LayerShapes> LayersShapesMap;
1155     typedef std::map<int, LayerData> MapIdToLayerData;
1156
1157     Impl()
1158     {
1159         //allocate fake net input layer
1160         netInputLayer = Ptr<DataLayer>(new DataLayer());
1161         LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
1162         inpl.id = 0;
1163         netInputLayer->name = inpl.name = "_input";
1164         inpl.type = "__NetInputLayer__";
1165         inpl.layerInstance = netInputLayer;
1166         layerNameToId.insert(std::make_pair(inpl.name, inpl.id));
1167
1168         lastLayerId = 0;
1169         netWasAllocated = false;
1170         fusion = true;
1171         isAsync = false;
1172         preferableBackend = DNN_BACKEND_DEFAULT;
1173         preferableTarget = DNN_TARGET_CPU;
1174         skipInfEngineInit = false;
1175     }
1176
1177     Ptr<DataLayer> netInputLayer;
1178     std::vector<LayerPin> blobsToKeep;
1179     MapIdToLayerData layers;
1180     std::map<String, int> layerNameToId;
1181     BlobManager blobManager;
1182     int preferableBackend;
1183     int preferableTarget;
1184     String halideConfigFile;
1185     bool skipInfEngineInit;
1186     // Map host data to backend specific wrapper.
1187     std::map<void*, Ptr<BackendWrapper> > backendWrappers;
1188
1189     int lastLayerId;
1190
1191     bool netWasAllocated;
1192     bool fusion;
1193     bool isAsync;
1194     std::vector<int64> layersTimings;
1195     Mat output_blob;
1196
1197 #ifdef HAVE_CUDA
1198     struct CudaInfo_t
1199     {
1200         CudaInfo_t(cuda4dnn::csl::CSLContext ctxt, cuda4dnn::csl::Stream d2h_stream_)
1201          : context(std::move(ctxt)), d2h_stream(std::move(d2h_stream_)) { }
1202         cuda4dnn::csl::CSLContext context;
1203         cuda4dnn::csl::Stream d2h_stream;
1204         cuda4dnn::csl::Workspace workspace;
1205     };
1206
1207     std::unique_ptr<CudaInfo_t> cudaInfo;
1208 #endif
1209
1210     Ptr<BackendWrapper> wrap(Mat& host)
1211     {
1212         if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
1213             return Ptr<BackendWrapper>();
1214
1215         MatShape shape(host.dims);
1216         for (int i = 0; i < host.dims; ++i)
1217             shape[i] = host.size[i];
1218
1219         void* data = host.data;
1220         if (backendWrappers.find(data) != backendWrappers.end())
1221         {
1222             Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
1223             if (preferableBackend == DNN_BACKEND_OPENCV)
1224             {
1225 #ifdef HAVE_OPENCL
1226                 CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
1227                 return OpenCLBackendWrapper::create(baseBuffer, host);
1228 #else
1229                 CV_Error(Error::StsInternal, "");
1230 #endif
1231             }
1232             else if (preferableBackend == DNN_BACKEND_HALIDE)
1233             {
1234                 CV_Assert(haveHalide());
1235 #ifdef HAVE_HALIDE
1236                 return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
1237 #endif
1238             }
1239             else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
1240             {
1241                 return wrapMat(preferableBackend, preferableTarget, host);
1242             }
1243             else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
1244             {
1245                 return wrapMat(preferableBackend, preferableTarget, host);
1246             }
1247             else if (preferableBackend == DNN_BACKEND_VKCOM)
1248             {
1249   #ifdef HAVE_VULKAN
1250                 return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
1251   #endif
1252             }
1253             else if (preferableBackend == DNN_BACKEND_CUDA)
1254             {
1255                 CV_Assert(haveCUDA());
1256 #ifdef HAVE_CUDA
1257                 switch (preferableTarget)
1258                 {
1259                 case DNN_TARGET_CUDA:
1260                     return CUDABackendWrapperFP32::create(baseBuffer, shape);
1261                 case DNN_TARGET_CUDA_FP16:
1262                     return CUDABackendWrapperFP16::create(baseBuffer, shape);
1263                 default:
1264                     CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
1265                 }
1266 #endif
1267             }
1268             else
1269                 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1270         }
1271
1272         Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
1273         backendWrappers[data] = wrapper;
1274         return wrapper;
1275     }
1276
1277 #ifdef HAVE_HALIDE
1278     void compileHalide()
1279     {
1280         CV_TRACE_FUNCTION();
1281
1282         CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);
1283
1284         HalideScheduler scheduler(halideConfigFile);
1285         std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
1286         for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
1287         {
1288             LayerData &ld = it->second;
1289             Ptr<Layer> layer = ld.layerInstance;
1290             if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
1291             {
1292                 CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
1293                 bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
1294                 if (!scheduled)
1295                 {
1296                     // Use automatic scheduling provided by layer.
1297                     layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
1298                                                 ld.inputBlobs, ld.outputBlobs,
1299                                                 preferableTarget);
1300                 }
1301                 compileList.emplace_back(ld);
1302             }
1303         }
1304         std::atomic<int> progress(0);
1305         auto fn = ([&] () -> void
1306         {
1307             for (;;)
1308             {
1309                 int id = progress.fetch_add(1);
1310                 if ((size_t)id >= compileList.size())
1311                     return;
1312                 const LayerData& ld = compileList[id].get();
1313                 Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
1314                 dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
1315             }
1316         });
1317         size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
1318         num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
1319         std::vector<std::thread> threads(num_threads - 1);
1320         for (auto& t: threads) t = std::thread(fn);
1321         fn(); // process own tasks
1322         for (auto& t: threads) t.join();
1323     }
1324 #endif
1325
1326     void clear()
1327     {
1328         CV_TRACE_FUNCTION();
1329
1330         MapIdToLayerData::iterator it;
1331         for (it = layers.begin(); it != layers.end(); it++)
1332         {
1333             if (it->second.id != 0) {
1334                 it->second.inputBlobs.clear();
1335                 it->second.outputBlobs.clear();
1336                 it->second.internals.clear();
1337             }
1338             it->second.skip = false;
1339             //it->second.consumers.clear();
1340             Ptr<Layer> currLayer = it->second.layerInstance;
1341
1342             if( currLayer.empty() )
1343                 continue;
1344
1345             currLayer->unsetAttached();
1346         }
1347
1348         layersTimings.clear();
1349     }
1350
1351     void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
1352     {
1353         CV_TRACE_FUNCTION();
1354
1355         if (dumpLevel && networkDumpCounter == 0)
1356         {
1357             dumpNetworkToFile();
1358         }
1359
1360         if (preferableBackend == DNN_BACKEND_DEFAULT)
1361             preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;
1362 #ifdef HAVE_INF_ENGINE
1363         if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1364             preferableBackend = getInferenceEngineBackendTypeParam();
1365 #endif
1366
1367         CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
1368                   preferableTarget == DNN_TARGET_CPU ||
1369                   preferableTarget == DNN_TARGET_OPENCL ||
1370                   preferableTarget == DNN_TARGET_OPENCL_FP16);
1371         CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
1372                   preferableTarget == DNN_TARGET_CPU ||
1373                   preferableTarget == DNN_TARGET_OPENCL);
1374         if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
1375             preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
1376         {
1377             CV_Assert(
1378                   preferableTarget == DNN_TARGET_CPU ||
1379                   preferableTarget == DNN_TARGET_OPENCL ||
1380                   preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1381                   preferableTarget == DNN_TARGET_MYRIAD ||
1382                   preferableTarget == DNN_TARGET_FPGA
1383             );
1384         }
1385         CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
1386                   preferableTarget == DNN_TARGET_VULKAN);
1387         CV_Assert(preferableBackend != DNN_BACKEND_CUDA ||
1388                   IS_DNN_CUDA_TARGET(preferableTarget));
1389         if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
1390         {
1391             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
1392 #ifndef HAVE_OPENCL
1393             {
1394                 CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
1395                 preferableTarget = DNN_TARGET_CPU;
1396             }
1397 #else
1398             {
1399                 if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
1400                 {
1401                     // Current implementation is only valid for GPU (#11494)
1402                     if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
1403                     {
1404                         CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
1405                         preferableTarget = DNN_TARGET_CPU;
1406                     }
1407                     else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
1408                     {
1409                         CV_LOG_WARNING(NULL,
1410                             "DNN: OpenCL target with fp16 precision is not supported "
1411                             "with current OpenCL device (tested with Intel GPUs only), "
1412                             "switching to OpenCL with fp32 precision.");
1413                         preferableTarget = DNN_TARGET_OPENCL;
1414                     }
1415                 }
1416             }
1417 #endif
1418             if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
1419             {
1420                 preferableBackend = DNN_BACKEND_OPENCV;
1421                 preferableTarget = DNN_TARGET_CPU;
1422             }
1423
1424             if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA())
1425             {
1426 #ifdef HAVE_CUDA
1427                 CV_LOG_WARNING(NULL, "unable to use CUDA backend; switching to CPU");
1428 #else
1429                 CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU");
1430 #endif
1431                 preferableBackend = DNN_BACKEND_OPENCV;
1432                 preferableTarget = DNN_TARGET_CPU;
1433             }
1434
1435             clear();
1436
1437             this->blobsToKeep = blobsToKeep_;
1438
1439             allocateLayers(blobsToKeep_);
1440
1441             MapIdToLayerData::iterator it = layers.find(0);
1442             CV_Assert(it != layers.end());
1443             it->second.skip = netInputLayer->skip;
1444
1445             initBackend(blobsToKeep_);
1446
1447             if (!netWasAllocated)
1448             {
1449 #ifdef HAVE_HALIDE
1450                 if (preferableBackend == DNN_BACKEND_HALIDE)
1451                     compileHalide();
1452 #else
1453                 CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
1454 #endif
1455             }
1456
1457             netWasAllocated = true;
1458
1459             if (dumpLevel)
1460             {
1461                 dumpNetworkToFile();
1462             }
1463         }
1464     }
1465
1466     int getLayerId(const String &layerName)
1467     {
1468         std::map<String, int>::iterator it = layerNameToId.find(layerName);
1469         return (it != layerNameToId.end()) ? it->second : -1;
1470     }
1471
1472     int getLayerId(int id)
1473     {
1474         MapIdToLayerData::iterator it = layers.find(id);
1475         return (it != layers.end()) ? id : -1;
1476     }
1477
1478     int getLayerId(DictValue &layerDesc)
1479     {
1480         if (layerDesc.isInt())
1481             return getLayerId(layerDesc.get<int>());
1482         else if (layerDesc.isString())
1483             return getLayerId(layerDesc.get<String>());
1484
1485         CV_Assert(layerDesc.isInt() || layerDesc.isString());
1486         return -1;
1487     }
1488
1489     String getLayerName(int id)
1490     {
1491         MapIdToLayerData::iterator it = layers.find(id);
1492         return (it != layers.end()) ? it->second.name : "(unknown layer)";
1493     }
1494
1495     LayerData& getLayerData(int id)
1496     {
1497         MapIdToLayerData::iterator it = layers.find(id);
1498
1499         if (it == layers.end())
1500             CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));
1501
1502         return it->second;
1503     }
1504
1505     LayerData& getLayerData(const String &layerName)
1506     {
1507         int id = getLayerId(layerName);
1508
1509         if (id < 0)
1510             CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");
1511
1512         return getLayerData(id);
1513     }
1514
1515     LayerData& getLayerData(const DictValue &layerDesc)
1516     {
1517         CV_Assert(layerDesc.isInt() || layerDesc.isString());
1518         if (layerDesc.isInt())
1519             return getLayerData(layerDesc.get<int>());
1520         else /*if (layerDesc.isString())*/
1521             return getLayerData(layerDesc.get<String>());
1522     }
1523
1524     static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
1525     {
1526         if ((int)ld.inputBlobsId.size() <= inNum)
1527         {
1528             ld.inputBlobsId.resize(inNum + 1);
1529         }
1530         else
1531         {
1532             LayerPin storedFrom = ld.inputBlobsId[inNum];
1533             if (storedFrom.valid() && !storedFrom.equal(from))
1534                 CV_Error(Error::StsError, format("Input #%d of layer \"%s\" was already connected",
1535                                                  inNum, ld.name.c_str()));
1536         }
1537
1538         ld.inputBlobsId[inNum] = from;
1539     }
1540
1541     int resolvePinOutputName(LayerData &ld, const String &outName)
1542     {
1543         if (outName.empty())
1544             return 0;
1545         return ld.getLayerInstance()->outputNameToIndex(outName);
1546     }
1547
1548     LayerPin getPinByAlias(const String &layerName)
1549     {
1550         LayerPin pin;
1551         pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1552
1553         if (pin.lid >= 0)
1554             pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);
1555
1556         return pin;
1557     }
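          // Illustrative note: an alias such as "detection_out" (hypothetical layer name) resolves to a
          // LayerPin{lid, oid}; an empty alias maps to the network input layer (lid = 0), and the output
          // index comes from Layer::outputNameToIndex(), which is typically 0 unless a layer overrides it.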
1558
1559     std::vector<LayerPin> getLayerOutPins(const String &layerName)
1560     {
1561         int lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1562
1563         std::vector<LayerPin> pins;
1564
1565         for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
1566         {
1567             pins.push_back(LayerPin(lid, i));
1568         }
1569
1570         return pins;
1571     }
1572
1573     void connect(int outLayerId, int outNum, int inLayerId, int inNum)
1574     {
1575         CV_Assert(outLayerId < inLayerId);
1576         LayerData &ldOut = getLayerData(outLayerId);
1577         LayerData &ldInp = getLayerData(inLayerId);
1578
1579         addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
1580         ldOut.requiredOutputs.insert(outNum);
1581         ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
1582     }
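          // Illustrative example: connect(1, 0, 2, 0) wires output #0 of layer 1 into input #0 of
          // layer 2, records layer 2 as a consumer of layer 1 and marks output #0 of layer 1 as required.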
1583
1584     void initBackend(const std::vector<LayerPin>& blobsToKeep_)
1585     {
1586         CV_TRACE_FUNCTION();
1587         if (preferableBackend == DNN_BACKEND_OPENCV)
1588         {
1589             CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
1590         }
1591         else if (preferableBackend == DNN_BACKEND_HALIDE)
1592             initHalideBackend();
1593         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
1594         {
1595 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
1596             initInfEngineBackend(blobsToKeep_);
1597 #else
1598             CV_Assert(false && "This OpenCV version is built without Inference Engine NN Builder API support");
1599 #endif
1600         }
1601         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
1602         {
1603 #ifdef HAVE_DNN_NGRAPH
1604             initNgraphBackend(blobsToKeep_);
1605 #else
1606             CV_Error(Error::StsNotImplemented, "This OpenCV version is built without support of Inference Engine + nGraph");
1607 #endif
1608         }
1609         else if (preferableBackend == DNN_BACKEND_VKCOM)
1610             initVkComBackend();
1611         else if (preferableBackend == DNN_BACKEND_CUDA)
1612             initCUDABackend(blobsToKeep_);
1613         else
1614             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1615     }
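          // Illustrative note: only the initializer matching preferableBackend runs; layers that end up
          // without a backend node are executed by the default OpenCV implementation during forward().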
1616
1617     void initHalideBackend()
1618     {
1619         CV_TRACE_FUNCTION();
1620         CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());
1621
1622         // Iterator to current layer.
1623         MapIdToLayerData::iterator it = layers.begin();
1624         // Iterator to the base layer for fusion. For example, in the case of conv+bn+relu
1625         // it'll be the conv layer.
1626         MapIdToLayerData::iterator baseIt = layers.begin();
1627         for (; it != layers.end(); it++)
1628         {
1629             LayerData &ldTop = it->second;
1630             Ptr<Layer> layerTop = ldTop.layerInstance;
1631             if (!layerTop->supportBackend(preferableBackend))
1632             {
1633                 // Move the base iterator to the layer that doesn't support the preferable
1634                 // backend, to prevent fusion across layers of different backends.
1635                 baseIt = it;
1636                 continue;
1637             }
1638             // Try to fuse layers.
1639             LayerData &ldBot = baseIt->second;
1640             Ptr<Layer> layerBot = ldBot.layerInstance;
1641             // 1. Check that the bottom and top layers use the same backend.
1642             if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
1643             {
1644                 // 2. Check that current layer works in-place.
1645                 bool inPlace = ldTop.inputBlobs.size() == 1 &&
1646                                ldBot.outputBlobs.size() == 1 &&
1647                                ldTop.inputBlobs[0]->data ==
1648                                ldBot.outputBlobs[0].data;
1649                 if (inPlace)
1650                 {
1651                     // 3. Try to attach node.
1652                     CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
1653                     Ptr<BackendNode> fusedNode =
1654                         layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
1655                     if (!fusedNode.empty())
1656                     {
1657                         ldTop.skip = true;
1658                         ldBot.backendNodes[preferableBackend] = fusedNode;
1659                         ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
1660                         continue;
1661                     }
1662                 }
1663             }
1664             // No layers fusion.
1665             ldTop.skip = false;
1666             ldTop.backendNodes[DNN_BACKEND_HALIDE] =
1667                 layerTop->initHalide(ldTop.inputBlobsWrappers);
1668             baseIt = it;
1669         }
1670     }
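          // Illustrative note: for an in-place chain such as conv -> bn -> relu, tryAttach() folds the
          // bn and relu nodes into the Halide node created for conv, so only the fused node is compiled
          // and scheduled; the absorbed layers are simply marked as skipped.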
1671
1672 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
1673     // Before launching the Inference Engine graph we need to specify output blobs.
1674     // This function requests output blobs based on input references of
1675     // layers from the default backend or layers from different graphs.
1676     void addInfEngineNetOutputs(LayerData &ld)
1677     {
1678         CV_TRACE_FUNCTION();
1679         Ptr<InfEngineBackendNet> layerNet;
1680         if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
1681         {
1682             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1683             if (!node.empty())
1684             {
1685                 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1686                 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1687                 layerNet = ieNode->net;
1688             }
1689         }
1690         // For every input reference we check whether it belongs to one of
1691         // the Inference Engine backend graphs. Request an output blob if it does.
1692         // Do nothing if the layer's input comes from the same graph.
1693         for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1694         {
1695             LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1696             Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1697             if (!inpNode.empty())
1698             {
1699                 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1700                 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1701                 if (layerNet != ieInpNode->net)
1702                 {
1703                     // layerNet is empty or nodes are from different graphs.
1704                     ieInpNode->net->addOutput(ieInpNode->layer.getName());
1705                 }
1706             }
1707         }
1708     }
1709
1710     void initInfEngineBackend(const std::vector<LayerPin>& blobsToKeep_)
1711     {
1712         CV_TRACE_FUNCTION();
1713         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019, haveInfEngine());
1714         MapIdToLayerData::iterator it;
1715         Ptr<InfEngineBackendNet> net;
1716
1717         for (it = layers.begin(); it != layers.end(); ++it)
1718         {
1719             LayerData &ld = it->second;
1720             if (ld.id == 0)
1721             {
1722                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1723                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1724                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1725                 {
1726                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1727 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1728                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1729 #else
1730                     dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
1731 #endif
1732                 }
1733             }
1734             else
1735             {
1736                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1737                 {
1738                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1739 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1740                     dataPtr->name = ld.name;
1741 #else
1742                     dataPtr->setName(ld.name);
1743 #endif
1744                 }
1745             }
1746         }
1747
1748         if (skipInfEngineInit)
1749         {
1750             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1751             CV_Assert(!node.empty());
1752
1753             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1754             CV_Assert(!ieNode.empty());
1755             ieNode->net->reset();
1756
1757             for (it = layers.begin(); it != layers.end(); ++it)
1758             {
1759                 LayerData &ld = it->second;
1760                 if (ld.id == 0)
1761                 {
1762                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1763                     {
1764                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1765 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1766                         dataPtr->name = netInputLayer->outNames[i];
1767 #else
1768                         dataPtr->setName(netInputLayer->outNames[i]);
1769 #endif
1770                     }
1771                 }
1772                 else
1773                 {
1774                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1775                     {
1776                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1777 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1778                         dataPtr->name = ld.name;
1779 #else
1780                         dataPtr->setName(ld.name);
1781 #endif
1782                     }
1783                 }
1784                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1785                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1786                 ld.skip = true;
1787             }
1788             layers[lastLayerId].skip = false;
1789             ieNode->net->init((Target)preferableTarget);
1790             return;
1791         }
1792
1793         // Build Inference Engine networks from sets of layers that support this
1794         // backend. Split the whole model into several Inference Engine networks if
1795         // some of the layers are not implemented.
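              // Illustrative example: with layers A -> B -> C where only B lacks Inference Engine support,
              // the model becomes two IE subgraphs {A} and {C}; B either runs on the default backend or,
              // when customization is possible, stays in the graph as a custom CPU layer.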
1796
1797         bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU ||
1798                                    BackendRegistry::checkIETarget(DNN_TARGET_CPU);
1799
1800         // Set of all input and output blobs wrappers for current network.
1801         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1802         for (it = layers.begin(); it != layers.end(); ++it)
1803         {
1804             LayerData &ld = it->second;
1805             if (ld.id == 0 && ld.skip)
1806                 continue;
1807             bool fused = ld.skip;
1808
1809             Ptr<Layer> layer = ld.layerInstance;
1810             if (!fused && !layer->supportBackend(preferableBackend))
1811             {
1812                 bool customizable = ld.id != 0 &&
1813                                     INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2) &&
1814                                     supportsCPUFallback;
1815                 // TODO: there is a bug in Myriad plugin with custom layers shape infer.
1816                 if (preferableTarget == DNN_TARGET_MYRIAD)
1817                 {
1818                     for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
1819                     {
1820                         customizable = ld.inputBlobs[i]->size[0] == 1;
1821                     }
1822                 }
1823
1824                 // TODO: fix these workarounds
1825                 if (preferableTarget == DNN_TARGET_MYRIAD ||
1826                     preferableTarget == DNN_TARGET_OPENCL ||
1827                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1828                     customizable &= ld.type != "Concat";
1829
1830                 if (preferableTarget == DNN_TARGET_OPENCL ||
1831                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1832                     customizable &= ld.type != "Power";
1833
1834                 if (preferableTarget == DNN_TARGET_OPENCL)
1835                     customizable &= ld.type != "Eltwise";
1836
1837                 if (!customizable)
1838                 {
1839                     addInfEngineNetOutputs(ld);
1840                     net = Ptr<InfEngineBackendNet>();
1841                     netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in #ifdef.
1842                     layer->preferableTarget = DNN_TARGET_CPU;
1843                     continue;
1844                 }
1845             }
1846             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1847
1848             // Create a new network if one of the inputs comes from a different Inference Engine graph.
1849             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1850             {
1851                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1852                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1853                 if (!inpNode.empty())
1854                 {
1855                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1856                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1857                     if (ieInpNode->net != net)
1858                     {
1859                         net = Ptr<InfEngineBackendNet>();
1860                         netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in #ifdef.
1861                         break;
1862                     }
1863                 }
1864             }
1865
1866             Ptr<BackendNode> node;
1867             if (!net.empty())
1868             {
1869                 if (fused)
1870                 {
1871                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1872                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1873                     CV_Assert(inPlace);
1874                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1875                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1876                 }
1877             }
1878             else
1879                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1880
1881             if (!fused)
1882             {
1883                 if (layer->supportBackend(preferableBackend))
1884                     node = layer->initInfEngine(ld.inputBlobsWrappers);
1885                 else
1886                 {
1887                     node = Ptr<BackendNode>(new InfEngineBackendNode(
1888                         ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
1889                 }
1890             }
1891             else if (node.empty())
1892                 continue;
1893
1894             CV_Assert(!node.empty());
1895             ld.backendNodes[preferableBackend] = node;
1896
1897             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1898             CV_Assert(!ieNode.empty());
1899             ieNode->net = net;
1900
1901             for (const auto& pin : blobsToKeep_)
1902             {
1903                 if (pin.lid == ld.id)
1904                 {
1905                     ieNode->net->addOutput(ieNode->layer.getName());
1906                     break;
1907                 }
1908             }
1909
1910             // Convert weights to FP16 for specific targets.
1911             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1912                  preferableTarget == DNN_TARGET_MYRIAD ||
1913                  preferableTarget == DNN_TARGET_FPGA) && !fused)
1914             {
1915 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
1916                 for (const std::string& name : {"weights", "biases"})
1917                 {
1918                     auto it = ieNode->layer.getParameters().find(name);
1919                     if (it != ieNode->layer.getParameters().end())
1920                     {
1921                         InferenceEngine::Blob::Ptr bp = it->second.as<InferenceEngine::Blob::Ptr>();
1922                         it->second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(bp));
1923                     }
1924                 }
1925 #else
1926                 auto& blobs = ieNode->layer.getConstantData();
1927                 if (blobs.empty())
1928                 {
1929                     // In case of a non-weightable layer we have to specify
1930                     // its precision by adding a dummy blob.
1931                     auto blob = InferenceEngine::make_shared_blob<int16_t>(
1932                                     InferenceEngine::Precision::FP16,
1933                                     InferenceEngine::Layout::C, {1});
1934                     blob->allocate();
1935                     blobs[""] = blob;
1936                 }
1937                 else
1938                 {
1939                     for (auto& it : blobs)
1940                         it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
1941                 }
1942 #endif
1943             }
1944
1945             if (!fused)
1946                 net->addLayer(ieNode->layer);
1947
1948             net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
1949             net->addBlobs(ld.inputBlobsWrappers);
1950             net->addBlobs(ld.outputBlobsWrappers);
1951             addInfEngineNetOutputs(ld);
1952         }
1953
1954         // Initialize all networks.
1955         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1956         {
1957             LayerData &ld = it->second;
1958             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1959                 continue;
1960
1961             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1962             if (node.empty())
1963                 continue;
1964
1965             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1966             if (ieNode.empty())
1967                 continue;
1968
1969             CV_Assert(!ieNode->net.empty());
1970
1971             if (!ieNode->net->isInitialized())
1972             {
1973                 ieNode->net->init((Target)preferableTarget);
1974                 ld.skip = false;
1975             }
1976         }
1977     }
1978 #endif  // HAVE_DNN_IE_NN_BUILDER_2019
1979
1980
1981 #ifdef HAVE_DNN_NGRAPH
1982     void addNgraphOutputs(LayerData &ld)
1983     {
1984         CV_TRACE_FUNCTION();
1985
1986         Ptr<InfEngineNgraphNet> layerNet;
1987         auto it = ld.backendNodes.find(preferableBackend);
1988         if (it != ld.backendNodes.end())
1989         {
1990             Ptr<BackendNode> node = it->second;
1991             if (!node.empty())
1992             {
1993                 Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
1994                 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1995                 layerNet = ieNode->net;
1996             }
1997         }
1998
1999         for (int i = 0; i < ld.inputBlobsId.size(); ++i)
2000         {
2001             LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
2002             Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
2003             if (!inpNode.empty())
2004             {
2005                 Ptr<InfEngineNgraphNode> ieInpNode = inpNode.dynamicCast<InfEngineNgraphNode>();
2006                 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
2007                 if (layerNet != ieInpNode->net)
2008                 {
2009                     ieInpNode->net->addOutput(ieInpNode->node->get_friendly_name());
2010                     ieInpNode->net->setUnconnectedNodes(ieInpNode);
2011                 }
2012             }
2013         }
2014     }
2015
2016     void initNgraphBackend(const std::vector<LayerPin>& blobsToKeep_)
2017     {
2018         CV_TRACE_FUNCTION();
2019         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH, haveInfEngine());
2020
2021         MapIdToLayerData::iterator it;
2022         Ptr<InfEngineNgraphNet> net;
2023
2024         for (it = layers.begin(); it != layers.end(); ++it)
2025         {
2026             LayerData &ld = it->second;
2027             if (ld.id == 0)
2028             {
2029                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
2030                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
2031                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2032                 {
2033                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
2034                     std::string outputName = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
2035                     outputName = ld.outputBlobsWrappers.size() > 1 ? (outputName + "." + std::to_string(i)) : outputName;
2036                     dataPtr->setName(outputName);
2037                 }
2038             }
2039             else
2040             {
2041                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2042                 {
2043                     InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
2044                     std::string outputName = ld.outputBlobsWrappers.size() > 1 ? (ld.name + "." + std::to_string(i)) : ld.name;
2045                     dataPtr->setName(outputName);
2046                 }
2047             }
2048         }
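              // Illustrative note: outputs of multi-output layers receive ".<index>" suffixes (e.g. a
              // hypothetical "split.0", "split.1") so each blob can be addressed by a unique name.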
2049
2050         if (skipInfEngineInit)
2051         {
2052             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
2053             CV_Assert(!node.empty());
2054
2055             Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
2056             CV_Assert(!ieNode.empty());
2057             ieNode->net->reset();
2058
2059             for (it = layers.begin(); it != layers.end(); ++it)
2060             {
2061                 LayerData &ld = it->second;
2062                 if (ld.id == 0)
2063                 {
2064                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
2065                     {
2066                         InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.inputBlobsWrappers[i]);
2067                         dataPtr->setName(netInputLayer->outNames[i]);
2068                     }
2069                 }
2070                 else
2071                 {
2072                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2073                     {
2074                         InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
2075                         dataPtr->setName(ld.name);
2076                     }
2077                 }
2078                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
2079                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
2080                 ld.skip = true;
2081             }
2082             layers[lastLayerId].skip = false;
2083             ieNode->net->init((Target)preferableTarget);
2084             return;
2085         }
2086
2087         bool supportsCPUFallback = preferableTarget == DNN_TARGET_CPU ||
2088                                    BackendRegistry::checkIETarget(DNN_TARGET_CPU);
2089
2090         // Build Inference Engine networks from sets of layers that support this
2091         // backend. Split the whole model into several Inference Engine networks if
2092         // some of the layers are not implemented.
2093         for (it = layers.begin(); it != layers.end(); ++it)
2094         {
2095             LayerData &ld = it->second;
2096
2097             if (ld.id == 0 && ld.skip)
2098                 continue;
2099
2100             bool fused = ld.skip;
2101             Ptr<Layer> layer = ld.layerInstance;
2102             if (!fused && !layer->supportBackend(preferableBackend))
2103             {
2104                 bool customizable = ld.id != 0 && supportsCPUFallback;
2105
2106                 // TODO: there is a bug in Myriad plugin with custom layers shape infer.
2107                 if (preferableTarget == DNN_TARGET_MYRIAD)
2108                 {
2109                     for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
2110                     {
2111                         customizable = ld.inputBlobs[i]->size[0] == 1;
2112                     }
2113                 }
2114
2115                 // TODO: fix these workarounds
2116                 if (preferableTarget == DNN_TARGET_MYRIAD ||
2117                     preferableTarget == DNN_TARGET_OPENCL ||
2118                     preferableTarget == DNN_TARGET_OPENCL_FP16)
2119                     customizable &= ld.type != "Concat";
2120
2121                 if (preferableTarget == DNN_TARGET_OPENCL ||
2122                     preferableTarget == DNN_TARGET_OPENCL_FP16)
2123                     customizable &= ld.type != "Power";
2124
2125                 if (preferableTarget == DNN_TARGET_OPENCL)
2126                     customizable &= ld.type != "Eltwise";
2127
2128                 if (!customizable)
2129                 {
2130                     addNgraphOutputs(ld);
2131                     net = Ptr<InfEngineNgraphNet>();
2132                     layer->preferableTarget = DNN_TARGET_CPU;
2133
2134                     for (int i = 0; i < ld.inputBlobsId.size(); ++i)
2135                     {
2136                         LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
2137                         Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
2138                         if (!inpNode.empty()) {
2139                             Ptr<InfEngineNgraphNode> ieNode = inpNode.dynamicCast<InfEngineNgraphNode>();
2140                             CV_Assert(!ieNode.empty());
2141                             ieNode->net->setUnconnectedNodes(ieNode);
2142                         }
2143                     }
2144                     continue;
2145                 }
2146             }
2147             ld.skip = true;  // Initially skip all Inference Engine supported layers.
2148
2149             // Create a new network if one of the inputs comes from a different Inference Engine graph.
2150             std::vector<Ptr<BackendNode>> inputNodes;
2151             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
2152             {
2153                 // Layer_Test_ROIPooling.Accuracy has 2 inputs with inpLd = 0; without this check we would collect 4 inputNodes (input, rois, input, rois)
2154                 if (inputNodes.size() == ld.inputBlobsId.size()) {
2155                     break;
2156                 }
2157                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
2158                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
2159                 if (!inpNode.empty())
2160                 {
2161                      Ptr<InfEngineNgraphNode> ieInpNode = inpNode.dynamicCast<InfEngineNgraphNode>();
2162                      CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
2163                      if (ieInpNode->net == net && !fused) {
2164                         inputNodes.push_back(inpNode);
2165                         continue;
2166                      }
2167                 }
2168
2169                 if (net.empty()) {
2170                     net = Ptr<InfEngineNgraphNet>(new InfEngineNgraphNet(*this));
2171                 }
2172
2173                 if (!fused) {
2174                     std::vector<std::string> inputNames;
2175                     std::vector<cv::Mat> inputs;
2176
2177                     auto curr_pos = inpLd.consumers.begin();
2178                     auto compare = [&ld] (const LayerPin& lp) { return lp.lid == ld.id; };
2179                     auto cons = curr_pos;
2180                     while ((cons = std::find_if(curr_pos, inpLd.consumers.end(), compare)) !=
2181                             inpLd.consumers.end()) {
2182                         int cons_inp = cons->oid;
2183                         Ptr<NgraphBackendWrapper> inpWrapper = inpLd.outputBlobsWrappers[cons_inp].
2184                                                                      dynamicCast<NgraphBackendWrapper>();
2185                         CV_Assert(!inpWrapper.empty());
2186                         auto iter = std::find(inputNames.begin(), inputNames.end(),
2187                                               inpWrapper->dataPtr->getName());
2188                         if (iter == inputNames.end()) {
2189                             inputNames.push_back(inpWrapper->dataPtr->getName());
2190                             inputs.push_back(inpLd.outputBlobs[cons_inp]);
2191                         }
2192                         curr_pos = cons + 1;
2193                     }
2194
2195                     auto inps = net->setInputs(inputs, inputNames);
2196                     for (auto& inp : inps) {
2197                         inputNodes.emplace_back(Ptr<BackendNode>(new InfEngineNgraphNode(inp)));
2198                     }
2199                 }
2200             }
2201
2202             Ptr<BackendNode> node;
2203             if (!net.empty())
2204             {
2205                 if (fused)
2206                 {
2207                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
2208                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
2209                     CV_Assert(inPlace);
2210                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
2211                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
2212                 }
2213             }
2214             else {
2215                 net = Ptr<InfEngineNgraphNet>(new InfEngineNgraphNet(*this));
2216             }
2217
2218             if (!fused)
2219             {
2220                 CV_Assert(ld.inputBlobsId.size() == inputNodes.size());
2221                 for (int i = 0; i < ld.inputBlobsId.size(); ++i)
2222                 {
2223                     int lid = ld.inputBlobsId[i].lid;
2224                     int oid = ld.inputBlobsId[i].oid;
2225                     if (oid == 0 || lid == 0)
2226                         continue;
2227
2228                     auto ieInpNode = inputNodes[i].dynamicCast<InfEngineNgraphNode>();
2229                     CV_Assert(oid < ieInpNode->node->get_output_size());
2230 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
2231                     inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node));
2232 #elif INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_3)
2233                     inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid)));
2234 #else
2235                     inputNodes[i] = Ptr<BackendNode>(new InfEngineNgraphNode(ieInpNode->node->get_output_as_single_output_node(oid, false)));
2236 #endif
2237                 }
2238
2239                 if (layer->supportBackend(preferableBackend))
2240                 {
2241                     node = layer->initNgraph(ld.inputBlobsWrappers, inputNodes);
2242                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2243                     {
2244                         InferenceEngine::DataPtr dataPtr = ngraphDataNode(ld.outputBlobsWrappers[i]);
2245                         node.dynamicCast<InfEngineNgraphNode>()->setName(dataPtr->getName());
2246                     }
2247                 }
2248                 else
2249                 {
2250                     node = Ptr<BackendNode>(new InfEngineNgraphNode(inputNodes,
2251                         ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
2252                 }
2253             }
2254             else if (node.empty())
2255                 continue;
2256
2257             ld.backendNodes[preferableBackend] = node;
2258
2259             Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
2260             CV_Assert(!ieNode.empty());
2261             ieNode->net = net;
2262
2263             if (ld.consumers.empty()) {
2264                 // TF EAST_text_detection
2265                 ieNode->net->setUnconnectedNodes(ieNode);
2266             }
2267             for (const auto& pin : blobsToKeep_)
2268             {
2269                 if (pin.lid == ld.id)
2270                 {
2271                     ieNode->net->addOutput(ieNode->node->get_friendly_name());
2272                     break;
2273                 }
2274             }
2275             ieNode->net->setNodePtr(&ieNode->node);
2276
2277             net->addBlobs(ld.inputBlobsWrappers);
2278             net->addBlobs(ld.outputBlobsWrappers);
2279             addNgraphOutputs(ld);
2280         }
2281
2282         // Initialize all networks.
2283         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
2284         {
2285             LayerData &ld = it->second;
2286             auto iter = ld.backendNodes.find(preferableBackend);
2287             if (iter == ld.backendNodes.end())
2288                 continue;
2289
2290             Ptr<BackendNode>& node = iter->second;
2291             if (node.empty())
2292                 continue;
2293
2294             Ptr<InfEngineNgraphNode> ieNode = node.dynamicCast<InfEngineNgraphNode>();
2295             if (ieNode.empty())
2296                 continue;
2297
2298             CV_Assert(!ieNode->net.empty());
2299
2300             if (!ieNode->net->isInitialized())
2301             {
2302                 ieNode->net->setUnconnectedNodes(ieNode);
2303                 ieNode->net->createNet((Target)preferableTarget);
2304                 ld.skip = false;
2305             }
2306         }
2307     }
2308 #endif  // HAVE_DNN_NGRAPH
2309
2310     void initVkComBackend()
2311     {
2312         CV_TRACE_FUNCTION();
2313         CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
2314 #ifdef HAVE_VULKAN
2315         if (!haveVulkan())
2316             return;
2317
2318         MapIdToLayerData::iterator it = layers.begin();
2319         for (; it != layers.end(); it++)
2320         {
2321             LayerData &ld = it->second;
2322             Ptr<Layer> layer = ld.layerInstance;
2323             if (!layer->supportBackend(preferableBackend))
2324             {
2325                 continue;
2326             }
2327
2328             ld.skip = false;
2329
2330             try
2331             {
2332                 ld.backendNodes[DNN_BACKEND_VKCOM] =
2333                     layer->initVkCom(ld.inputBlobsWrappers);
2334             }
2335             catch (const cv::Exception& e)
2336             {
2337                 CV_LOG_ERROR(NULL, "initVkCom failed, falling back to CPU implementation. " << e.what());
2338                 ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
2339             }
2340         }
2341 #endif
2342     }
2343
2344     void initCUDABackend(const std::vector<LayerPin>& blobsToKeep_)
2345     {
2346         CV_Assert(haveCUDA());
2347         CV_Assert(preferableBackend == DNN_BACKEND_CUDA);
2348
2349 #ifdef HAVE_CUDA
2350         if (cuda4dnn::getDeviceCount() <= 0)
2351             CV_Error(Error::StsError, "No CUDA capable device found.");
2352
2353         if (cuda4dnn::getDevice() < 0)
2354             CV_Error(Error::StsError, "No CUDA capable device selected.");
2355
2356         if (!cuda4dnn::isDeviceCompatible())
2357             CV_Error(Error::GpuNotSupported, "OpenCV was not built to work with the selected device. Please check CUDA_ARCH_PTX or CUDA_ARCH_BIN in your build configuration.");
2358
2359         if (preferableTarget == DNN_TARGET_CUDA_FP16 && !cuda4dnn::doesDeviceSupportFP16())
2360             CV_Error(Error::StsError, "The selected CUDA device does not support FP16 operations.");
2361
2362         if (!cudaInfo)
2363         {
2364             cuda4dnn::csl::CSLContext context;
2365             context.stream = cuda4dnn::csl::Stream(true);
2366             context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
2367             context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
2368
2369             auto d2h_stream = cuda4dnn::csl::Stream(true); // stream for background D2H data transfers
2370             cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context), std::move(d2h_stream)));
2371             cuda4dnn::checkVersions();
2372         }
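              // Illustrative note: the CSLContext above bundles one CUDA stream plus cuBLAS/cuDNN handles
              // bound to it; every CUDA node created below shares this context, while the separate
              // d2h_stream is reserved for background device-to-host copies of blobs that must be kept.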
2373
2374         cudaInfo->workspace = cuda4dnn::csl::Workspace(); // release workspace memory if any
2375
2376         for (auto& layer : layers)
2377         {
2378             auto& ld = layer.second;
2379             if (ld.id == 0)
2380             {
2381                 for (auto& wrapper : ld.inputBlobsWrappers)
2382                 {
2383                     auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
2384                     cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
2385                 }
2386             }
2387
2388             for (auto& wrapper : ld.outputBlobsWrappers)
2389             {
2390                 auto cudaWrapper = wrapper.dynamicCast<CUDABackendWrapper>();
2391                 cudaWrapper->setStream(cudaInfo->context.stream, cudaInfo->d2h_stream);
2392             }
2393         }
2394
2395         for (auto& layer : layers)
2396         {
2397             auto& ld = layer.second;
2398             auto& layerInstance = ld.layerInstance;
2399
2400             if (!layerInstance->supportBackend(DNN_BACKEND_CUDA))
2401             {
2402                 std::ostringstream os;
2403                 os << "CUDA backend will fall back to the CPU implementation for the layer \"" << ld.name
2404                    << "\" of type " << ld.type << '\n';
2405                 CV_LOG_INFO(NULL, os.str().c_str());
2406                 continue;
2407             }
2408
2409             /* we make a copy so that `initCUDA` doesn't modify `cudaInfo->context` */
2410             auto context = cudaInfo->context;
2411             auto node = layerInstance->initCUDA(&context, ld.inputBlobsWrappers, ld.outputBlobsWrappers);
2412             ld.backendNodes[DNN_BACKEND_CUDA] = node;
2413
2414             auto cudaNode = node.dynamicCast<CUDABackendNode>();
2415             cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
2416         }
2417
2418         if (blobsToKeep_.size() > 1)
2419         {
2420             for (const auto& pin : blobsToKeep_)
2421             {
2422                 LayerData& ld = layers[pin.lid];
2423                 ld.cudaD2HBackgroundTransfers.push_back(pin.oid);
2424             }
2425         }
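              // Illustrative note: when more than one output pin has to be kept, the corresponding output
              // indices are queued for background D2H transfers so the copies overlap with the remaining
              // forward pass instead of blocking it.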
2426 #endif
2427     }
2428
2429     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
2430     {
2431         CV_TRACE_FUNCTION();
2432
2433         LayerData &ld = layers[lid];
2434
2435         //already allocated
2436         if (ld.flag)
2437             return;
2438
2439         size_t ninputs = ld.inputBlobsId.size();
2440 #if 0
2441         printf("layer %s:", ld.name.c_str());
2442         for (size_t i = 0; i < ninputs; i++)
2443         {
2444             int inp_lid = ld.inputBlobsId[i].lid;
2445             LayerData &inp_ld = layers[inp_lid];
2446             int inp_outputs = (int)inp_ld.outputBlobs.size();
2447             std::cout << " " << inp_ld.name << "(" << inp_outputs;
2448
2449             for( int j = 0; j < inp_outputs; j++ )
2450             {
2451                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
2452             }
2453             std::cout << ")";
2454         }
2455         printf("\n");
2456 #endif
2457
2458         //determine parent layers
2459         for (size_t i = 0; i < ninputs; i++)
2460             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
2461
2462         //allocate parents
2463         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
2464             allocateLayer(*i, layersShapes);
2465
2466         //bind inputs
2467         if (ld.id == 0)  // DataLayer
2468         {
2469             ninputs = netInputLayer->inputsData.size();
2470             ld.inputBlobsWrappers.resize(ninputs);
2471             for (size_t i = 0; i < ninputs; i++)
2472                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
2473         }
2474         else
2475         {
2476             ld.inputBlobs.resize(ninputs);
2477             ld.inputBlobsWrappers.resize(ninputs);
2478             for (size_t i = 0; i < ninputs; i++)
2479             {
2480                 LayerPin from = ld.inputBlobsId[i];
2481                 CV_Assert(from.valid());
2482                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
2483                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
2484                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
2485             }
2486         }
2487
2488         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
2489
2490         CV_Assert(layerShapesIt != layersShapes.end());
2491
2492         std::vector<LayerPin> pinsForInternalBlobs;
2493         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
2494                                           preferableBackend == DNN_BACKEND_OPENCV &&
2495                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
2496         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
2497         for (int i = 0; i < ld.outputBlobs.size(); ++i)
2498             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
2499
2500         /* CUDA backend has its own system for internal blobs; we don't need these */
2501         ld.internalBlobsWrappers.resize((preferableBackend == DNN_BACKEND_CUDA) ? 0 : ld.internals.size());
2502         for (int i = 0; i < ld.internalBlobsWrappers.size(); ++i)
2503             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
2504
2505         Ptr<Layer> layerPtr = ld.getLayerInstance();
2506         {
2507             std::vector<Mat> inps(ld.inputBlobs.size());
2508             for (int i = 0; i < ld.inputBlobs.size(); ++i)
2509             {
2510                 inps[i] = *ld.inputBlobs[i];
2511             }
2512             layerPtr->finalize(inps, ld.outputBlobs);
2513             layerPtr->preferableTarget = preferableTarget;
2514 #if 0
2515             std::cout << "\toutputs:";
2516             size_t noutputs = ld.outputBlobs.size();
2517             for (size_t j = 0; j < noutputs; j++)
2518             {
2519                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
2520             }
2521             std::cout << "\n";
2522 #endif
2523         }
2524
2525         // After allocating the layer, we decrease the reference counters of its input blobs.
2526         blobManager.releaseReferences(ld.inputBlobsId);
2527         blobManager.releaseReferences(pinsForInternalBlobs);
2528
2529         ld.flag = 1;
2530     }
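          // Illustrative note: allocateLayer() recurses through inputLayersId, so allocating the last
          // layer allocates the whole dependency chain; ld.flag guards against visiting a layer twice.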
2531
2532 #if 0
2533 #define printf_(args) printf args
2534 #else
2535 #define printf_(args)
2536 #endif
2537
2538     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
2539     {
2540         CV_TRACE_FUNCTION();
2541
2542         if(!fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
2543                         preferableBackend != DNN_BACKEND_CUDA &&
2544                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 &&
2545                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE_NGRAPH))
2546            return;
2547
2548         // Scan through all the layers. If there is a convolution layer followed by an activation layer,
2549         // we try to embed the activation into the convolution and disable separate execution of the activation.
2550         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
2551                                       blobsToKeep_.end());
2552         MapIdToLayerData::iterator it;
2553         for (it = layers.begin(); it != layers.end(); it++)
2554         {
2555             int lid = it->first;
2556             LayerData& ld = layers[lid];
2557             if( ld.skip )
2558             {
2559                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2560                 continue;
2561             }
2562             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2563
2564             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
2565             // with the current layer if they follow it. Normally, they are fused with the convolution layer,
2566             // but some of them (like activation) may be fused with fully-connected, element-wise (+) and
2567             // some other layers.
2568             Ptr<Layer>& currLayer = ld.layerInstance;
2569             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
2570             {
2571                 LayerData* nextData = &layers[ld.consumers[0].lid];
2572                 LayerPin lpNext(ld.consumers[0].lid, 0);
2573                 while (nextData)
2574                 {
2575                     /* we use the `tryFuse` member of the convolution layer to fuse eltwise later;
2576                      * it's not intended to be fused here, hence we stop when we encounter an eltwise layer
2577                      */
2578                     if (preferableBackend == DNN_BACKEND_CUDA && ld.type == "Convolution" && nextData->type == "Eltwise")
2579                         break;
2580                     Ptr<Layer> nextLayer = nextData->layerInstance;
2581                     if (currLayer->tryFuse(nextLayer))
2582                     {
2583                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
2584                         nextData->skip = true;
2585                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2586                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2587                         if (nextData->consumers.size() == 1)
2588                         {
2589                             int nextLayerId = nextData->consumers[0].lid;
2590                             nextData = &layers[nextLayerId];
2591                             lpNext = LayerPin(nextLayerId, 0);
2592                         }
2593                         else
2594                         {
2595                             nextData = 0;
2596                             break;
2597                         }
2598                     }
2599                     else
2600                         break;
2601                 }
2602
2603                 if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
2604                     continue;  // Go to the next layer.
2605
2606                 // TODO: support more fusion styles for the OpenCL target.
2607                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
2608                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
2609                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
2610                      ld.layerInstance->type != "Concat")) )
2611                     continue;
2612
2613                 if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget)
2614                     && ld.layerInstance->type != "Convolution"
2615                     && ld.layerInstance->type != "Concat")
2616                     continue;
2617
2618                 while (nextData)
2619                 {
2620                     // For now, the OpenCL target only supports fusion with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations
2621                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
2622                         nextData->type != "ReLU" &&
2623                         nextData->type != "ChannelsPReLU" &&
2624                         nextData->type != "ReLU6" &&
2625                         nextData->type != "TanH" &&
2626                         nextData->type != "Power")
2627                         break;
2628
2629                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2630                     if (nextActivLayer.empty())
2631                         break;
2632
2633                     if (currLayer->setActivation(nextActivLayer))
2634                     {
2635                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2636                         nextData->skip = true;
2637                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2638                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2639                         if (nextData->consumers.size() == 1)
2640                         {
2641                             int nextLayerId = nextData->consumers[0].lid;
2642                             nextData = &layers[nextLayerId];
2643                             lpNext = LayerPin(nextLayerId, 0);
2644                         }
2645                         else
2646                         {
2647                             nextData = 0;
2648                             break;
2649                         }
2650                     }
2651                     else
2652                         break;
2653                 }
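                      // Illustrative example: for Conv -> BatchNorm -> ReLU, the tryFuse() loop above lets the
                      // convolution absorb BatchNorm, and this setActivation() loop then attaches ReLU; the fused
                      // layers are marked skip and the convolution takes over their output blobs.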
2654
2655                 // OpenCL: fuse convolution layer followed by eltwise + relu
2656                 // CUDA: fuse convolution layer followed by eltwise (and optional activation)
2657                 while (nextData &&
2658                     (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
2659                     ld.layerInstance->type == "Convolution"
2660                 )  // semantic of 'if'
2661                 {
2662                     Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
2663                     if (nextEltwiseLayer.empty())
2664                         break;
2665
2666 #ifdef HAVE_CUDA
2667                     // CUDA backend supports fusion with eltwise sum (without variable channels)
2668                     // `nextEltwiseLayer` is reset if eltwise layer doesn't have a compatible configuration for fusion
2669                     if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
2670                     {
2671                         // we create a temporary backend node for eltwise layer to obtain the eltwise configuration
2672                         cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
2673                         const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
2674                         const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
2675                         // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
2676                         // Hence, a successful cast to EltwiseOp implies that the number of channels is same in all operand tensors.
2677                         if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
2678                             nextEltwiseLayer = Ptr<EltwiseLayer>();
2679                     }
2680 #endif
2681
2682                     if (pinsToKeep.count(lpNext) != 0)
2683                         break;
2684                     if (nextData->inputBlobsId.size() != 2)
2685                         break;
2686
2687                     if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) == "sum")
2688                     {
2689                         if (nextData->params.has("coeff"))
2690                         {
2691                             DictValue paramCoeff = nextData->params.get("coeff");
2692                             int n = paramCoeff.size();
2693                             bool isCoeffOneOne = (n == 2);
2694                             for (int i = 0; isCoeffOneOne && i < n; i++)
2695                             {
2696                                 float c = paramCoeff.get<float>(i);
2697                                 isCoeffOneOne &= (c == 1.0f);
2698                             }
2699                             if (!isCoeffOneOne)
2700                             {
2701                                 CV_LOG_DEBUG(NULL, "DNN/OpenCL: only fusion of 'Sum' without coeffs (or with coeffs {1.0, 1.0}) is supported");
2702                                 break;
2703                             }
2704                         }
2705                     }
2706                     else
2707                     {
2708                         CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion with eltwise operation is not supported: " << nextData->params.get<String>("operation"));
2709                         break;
2710                     }
2711
2712                     {
2713                         LayerData *eltwiseData = nextData;
2714
2715                         // The eltwise layer has two inputs. We need to determine which
2716                         // is the base convolution layer and which could be used as its bias.
2717                         LayerData* biasLayerData = 0;
2718                         for (int i = 0; i < 2; ++i)
2719                         {
2720                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
2721                             CV_Assert(downLayerData);
2722                             while (downLayerData->skip)
2723                             {
2724                                 if (downLayerData->inputBlobsId.size() == 1)
2725                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
2726                                 else
2727                                 {
2728                                     downLayerData = 0;
2729                                     break;
2730                                 }
2731                             }
2732                             if (downLayerData && ld.id == downLayerData->id)
2733                             {
2734                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
2735                                 break;
2736                             }
2737                         }
2738                         CV_Assert(biasLayerData);
2739                         {
2740                             // fuse eltwise + activation layer
2741                             // bias must already be computed to fuse => bias layer must appear before convolution
2742                             if (biasLayerData->id < ld.id)
2743                             {
2744                                 /* we can fuse the activation if:
2745                                  * => the activation layer that follows is the only consumer of the eltwise output
2746                                  * => the activation layer does not process multiple inputs
2747                                  * => we are not required to keep the eltwise output
2748                                  */
2749                                 Ptr<ActivationLayer> nextFusableActivLayer;
2750                                 if (eltwiseData->consumers.size() == 1 && pinsToKeep.count(lpNext) == 0)
2751                                 {
2752                                     nextData = &layers[eltwiseData->consumers[0].lid];
2753                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
2754                                     CV_Assert(nextData);
2755                                     if (nextData->outputBlobs.size() == 1)
2756                                         nextFusableActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2757                                 }
2758                                 else
2759                                 {
2760                                     // OCL backend cannot fuse in this case but the CUDA backend can continue with just eltwise
2761                                     nextData = 0;
2762                                 }
2763
2764                                 // the requirements of OCV OpenCL backend and CUDA backend are different
2765                                 // we need to check them separately; hence, the fuse variables
2766                                 bool fuse_eltwise = false, fuse_activation = false;
2767
2768                                 Ptr<PowerLayer> activ_power;
2769                                 if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusableActivLayer.empty() &&
2770                                     nextData &&
2771                                     (!nextData->type.compare("ReLU") ||
2772                                      !nextData->type.compare("ChannelsPReLU") ||
2773                                      (!nextData->type.compare("Power") && (activ_power = nextFusableActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
2774                                     ) &&
2775                                     currLayer->setActivation(nextFusableActivLayer))
2776                                 {
2777                                     fuse_eltwise = true;
2778                                     fuse_activation = true;
2779                                 }
2780
2781                                 if (IS_DNN_CUDA_TARGET(preferableTarget))
2782                                 {
2783                                     /* supported fusion options:
2784                                      * => convolution + eltwise
2785                                      * => activation(convolution) + eltwise
2786                                      *    > convolution + activation would have been fused already; we have to fuse eltwise
2787                                      * => activation(convolution + eltwise)
2788                                      *    > fuse eltwise and then activation
2789                                      */
2790                                     auto layer = nextEltwiseLayer.staticCast<Layer>();
2791                                     if (currLayer->tryFuse(layer))
2792                                     {
2793                                         fuse_eltwise = true; /* eltwise was successfully fused */
2794                                         if (!nextFusableActivLayer.empty() && nextData)
2795                                         {
2796                                             if ((!nextData->type.compare("ReLU") ||
2797                                                  !nextData->type.compare("ReLU6") ||
2798                                                  !nextData->type.compare("Power") ||
2799                                                  !nextData->type.compare("TanH") ||
2800                                                  !nextData->type.compare("Sigmoid") ||
2801                                                  !nextData->type.compare("Swish") ||
2802                                                  !nextData->type.compare("Mish")) &&
2803                                                 currLayer->setActivation(nextFusableActivLayer))
2804                                             {
2805                                                 // activation was fused
2806                                                 fuse_activation = true;
2807                                             }
2808                                         }
2809                                     }
2810                                 }
2811
2812                                 CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */
2813                                 if(fuse_eltwise && fuse_activation)
2814                                 {
2815                                     CV_Assert(nextData);
2816                                     CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2817                                     ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2818                                     printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2819                                     printf_(("\tfused with %s\n", nextFusableActivLayer->name.c_str()));
2820                                     eltwiseData->skip = true;
2821                                     nextData->skip = true;
2822                                     // This optimization is for cases like
2823                                     // some_layer   conv
2824                                     //   |             |
2825                                     //   +-- eltwise --+
2826                                     //          |
2827                                     //        activ
2828                                     // This way all the element-wise computations
2829                                     // (i.e. some_layer+conv or some_layer*conv)
2830                                     // would be done at the [conv] layer. So we need to
2831                                     // replace [conv]'s output blob with [eltwise]'s one,
2832                                     // considering that [activ] is an in-place layer.
2833                                     // Also we need to move all the consumers' references.
2834                                     // To prevent memory collisions (i.e. when the input of
2835                                     // [conv] and the output of [eltwise] are the same blob)
2836                                     // we allocate a new blob.
2837                                     CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2838                                     ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2839                                     ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2840
2841                                     eltwiseData->outputBlobs = ld.outputBlobs;
2842                                     nextData->outputBlobs = ld.outputBlobs;
2843                                     eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2844                                     nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
2845
2846                                     // Move references of [activ] layer consumers to the newly allocated blob.
2847                                     for (int i = 0; i < nextData->consumers.size(); ++i)
2848                                     {
2849                                         LayerData& consumer = layers[nextData->consumers[i].lid];
2850                                         for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2851                                         {
2852                                             if (consumer.inputBlobsId[j].lid == lpNext.lid)
2853                                             {
2854                                                 consumer.inputBlobs[j] = &ld.outputBlobs[0];
2855                                                 consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2856                                                 break;
2857                                             }
2858                                         }
2859                                     }
2860                                 }
2861                                 else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
2862                                 {
2863                                     CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
2864                                     CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2865                                     ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2866                                     printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2867                                     eltwiseData->skip = true;
2868                                     // This optimization is for cases like
2869                                     // some_layer   conv (maybe fused with activ)
2870                                     //   |             |
2871                                     //   +-- eltwise --+
2872                                     //
2873                                     // This way all the element-wise computations
2874                                     // (i.e. some_layer+conv or some_layer*conv)
2875                                     // would be done at the [conv] layer. So we need to
2876                                     // replace [conv]'s output blob with [eltwise]'s one.
2877                                     // Also we need to move all the consumers' references.
2878                                     // To prevent memory collisions (i.e. when the input of
2879                                     // [conv] and the output of [eltwise] are the same blob)
2880                                     // we allocate a new blob.
2881                                     CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2882                                     ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2883                                     ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2884
2885                                     eltwiseData->outputBlobs = ld.outputBlobs;
2886                                     eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2887
2888                                     // Move references of [eltwise] layer consumers to the newly allocated blob.
2889                                     for (int i = 0; i < eltwiseData->consumers.size(); ++i)
2890                                     {
2891                                         LayerData& consumer = layers[eltwiseData->consumers[i].lid];
2892                                         for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2893                                         {
2894                                             if (consumer.inputBlobsId[j].lid == eltwiseData->id)
2895                                             {
2896                                                 consumer.inputBlobs[j] = &ld.outputBlobs[0];
2897                                                 consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2898                                                 break;
2899                                             }
2900                                         }
2901                                     }
2902                                 }
2903                             }
2904                         }
2905                     }
2906
2907                     break;
2908                 }
2909             }
2910
2911             if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
2912                 continue;  // Go to the next layer.
2913
2914             // optimization #2: if there is a concat layer that concatenates channels
2915             // from its inputs (i.e. axis == 1), then we make the inputs of
2916             // the concat layer write directly into the concatenation output buffer
2917             // (and so we eliminate the concatenation layer, because the channels
2918             // are concatenated implicitly).
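            // For example (a sketch, axis == 1, batch_size == 1):
            //   input0 [1 x C0 x H x W] --+
            //                             +--> concat output [1 x (C0+C1) x H x W]
            //   input1 [1 x C1 x H x W] --+
            // After the optimization, input0 writes into channels [0, C0) and input1 into
            // channels [C0, C0+C1) of the (cloned) output blob, and the Concat layer is skipped.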
2919             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
2920             if( !concatLayer.empty() && !concatLayer->padding && ld.outputBlobs.size() == 1 )
2921             {
2922                 Mat& output = ld.outputBlobs[0];
2923                 UMat umat_output;
2924 #ifdef HAVE_OPENCL
2925                 if (!ld.outputBlobsWrappers.empty() &&
2926                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
2927                 {
2928                     size_t i, ninputs = ld.inputBlobsId.size();
2929                     bool conv_layer = true;
2930                     for( i = 0; i < ninputs; i++ )
2931                     {
2932                         LayerPin pin = ld.inputBlobsId[i];
2933                         LayerData* inp_i_data = &layers[pin.lid];
2934                         while(inp_i_data->skip &&
2935                               inp_i_data->inputBlobsId.size() == 1 &&
2936                               inp_i_data->consumers.size() == 1)
2937                         {
2938                             pin = inp_i_data->inputBlobsId[0];
2939                             inp_i_data = &layers[pin.lid];
2940                         }
2941                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
2942                     }
2943                     if (!conv_layer)
2944                         continue;
2945                     std::vector<UMat> umat_outputBlobs;
2946                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2947                     umat_output = umat_outputBlobs[0];
2948                 }
2949 #endif
2950
2951                 // TODO: in general, this optimization can always be done, but
2952                 // many layers currently check that the input/output blobs are
2953                 // continuous arrays. Unfortunately, this is not true when
2954                 // the concatenation optimization is applied with batch_size > 1.
2955                 // So, for now, we only apply this optimization in the most popular
2956                 // case, batch_size == 1.
2957                 int axis = clamp(concatLayer->axis, output.dims);
2958                 if( output.total(0, axis) == 1 )
2959                 {
2960                     size_t i, ninputs = ld.inputBlobsId.size();
2961                     std::vector<LayerPin> realinputs(ninputs);
2962                     for( i = 0; i < ninputs; i++ )
2963                     {
2964                         LayerPin pin = ld.inputBlobsId[i];
2965                         LayerData* inp_i_data = &layers[pin.lid];
2966                         while(inp_i_data->skip &&
2967                               inp_i_data->inputBlobsId.size() == 1 &&
2968                               inp_i_data->consumers.size() == 1)
2969                         {
2970                             pin = inp_i_data->inputBlobsId[0];
2971                             inp_i_data = &layers[pin.lid];
2972                         }
2973                         printf_(("\treal input for %s is %s\n",
2974                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
2975                                inp_i_data->getLayerInstance()->name.c_str()));
2976
2977                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
2978                             break;
2979 #ifdef HAVE_CUDA
2980                         if (preferableBackend == DNN_BACKEND_CUDA &&
2981                             (inp_i_data->layerInstance->supportBackend(DNN_BACKEND_CUDA) == false ||
2982                              (inp_i_data->layerInstance->type != "Convolution" &&
2983                               inp_i_data->layerInstance->type != "Pooling" &&
2984                               inp_i_data->layerInstance->type != "Resize"  &&
2985                               inp_i_data->layerInstance->type != "Flatten" &&
2986                               inp_i_data->layerInstance->type != "Permute" &&
2987                               inp_i_data->layerInstance->type != "Reorg" &&
2988                               inp_i_data->layerInstance->type != "Eltwise" &&
2989                               inp_i_data->layerInstance.dynamicCast<ActivationLayer>().empty())))
2990                         {
2991                             break;
2992                         }
2993 #endif
2994                         realinputs[i] = pin;
2995                     }
2996
2997                     if( i >= ninputs )
2998                     {
2999                         // Allocate new memory to prevent collisions during memory
3000                         // reusing (see https://github.com/opencv/opencv/pull/10456).
3001                         output = output.clone();
3002 #ifdef HAVE_OPENCL
3003                         if (preferableBackend == DNN_BACKEND_OPENCV &&
3004                             IS_DNN_OPENCL_TARGET(preferableTarget))
3005                         {
3006                             std::vector<UMat> umats(1);
3007                             umat_output = umat_output.clone();
3008                             umats[0] = umat_output;
3009                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
3010                         }
3011 #endif
3012
3013 #ifdef HAVE_CUDA
3014                         if (preferableBackend == DNN_BACKEND_CUDA)
3015                             ld.outputBlobsWrappers[0] = wrap(output);
3016 #endif
3017                         std::vector<Range> chrange(output.dims, Range::all());
3018                         int ofs = 0;
3019                         for( i = 0; i < ninputs; i++ )
3020                         {
3021                             LayerPin pin = realinputs[i];
3022                             LayerData* inp_i_data = &layers[pin.lid];
3023                             int channels_i = ld.inputBlobs[i]->size[axis];
3024                             chrange[axis] = Range(ofs, ofs + channels_i);
3025                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
3026                                    pin.oid, ofs, ofs + channels_i));
3027                             ofs += channels_i;
3028                             Mat output_slice = output(chrange);
3029                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
3030                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
3031                             Mat* oldPtr = &curr_output;
3032                             curr_output = output_slice;
3033 #ifdef HAVE_OPENCL
3034                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
3035                             {
3036                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
3037                                 umats[pin.oid] = umat_output(chrange);
3038                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
3039                             }
3040 #endif
3041 #ifdef HAVE_CUDA
3042                             if (preferableBackend == DNN_BACKEND_CUDA)
3043                             {
3044                                 auto cuda_wrapper = wrap(output).dynamicCast<CUDABackendWrapper>();
3045                                 auto offset = chrange[axis].start * output_slice.total(axis + 1, output.dims);
3046                                 auto new_shape = shape(output_slice);
3047                                 cuda_wrapper->update(new_shape, offset);
3048                                 inp_i_data->outputBlobsWrappers[pin.oid] = cuda_wrapper.staticCast<BackendWrapper>();
3049                             }
3050 #endif
3051                             // Layers that refer old input Mat will refer to the
3052                             // new data but the same Mat object.
3053                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
3054                         }
3055
3056 #ifdef HAVE_CUDA
3057                         if (preferableBackend == DNN_BACKEND_CUDA)
3058                         {
3059                             for (int i = 0; i < ld.consumers.size(); i++)
3060                             {
3061                                 LayerData& consumer = layers[ld.consumers[i].lid];
3062                                 for (int j = 0; j < consumer.inputBlobsId.size(); j++)
3063                                 {
3064                                     if (consumer.inputBlobsId[j].lid == ld.id)
3065                                     {
3066                                         CV_Assert(consumer.inputBlobs[j]->data == ld.outputBlobs[0].data);
3067                                         consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
3068                                         break;
3069                                     }
3070                                 }
3071                             }
3072                         }
3073 #endif
3074                         ld.skip = true;
3075                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
3076                     }
3077                 }
3078             }
3079         }
3080     }
3081
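    // Allocates blobs and backend wrappers for the whole network:
    //  * resets per-layer flags and the blob manager,
    //  * infers every layer's shapes from the network input shapes,
    //  * registers references for network inputs, layer inputs and user-requested blobs,
    //  * allocates each layer in id order and finally runs fuseLayers().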
3082     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
3083     {
3084         CV_TRACE_FUNCTION();
3085
3086         MapIdToLayerData::iterator it;
3087         for (it = layers.begin(); it != layers.end(); it++)
3088             it->second.flag = 0;
3089
3090         CV_Assert(!layers[0].outputBlobs.empty());
3091         ShapesVec inputShapes;
3092         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
3093         {
3094             Mat& inp = layers[0].outputBlobs[i];
3095             CV_Assert(inp.total());
3096             if (preferableBackend == DNN_BACKEND_OPENCV &&
3097                 preferableTarget == DNN_TARGET_OPENCL_FP16)
3098             {
3099                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
3100             }
3101             inputShapes.push_back(shape(inp));
3102         }
3103         LayersShapesMap layersShapes;
3104         getLayersShapes(inputShapes, layersShapes);
3105
3106         blobManager.reset();
3107         backendWrappers.clear();
3108
3109         for(auto& layer : layers)
3110         {
3111             auto& ld = layer.second;
3112             ld.inputBlobsWrappers.clear();
3113             ld.outputBlobsWrappers.clear();
3114             ld.internalBlobsWrappers.clear();
3115         }
3116
3117         // Fake references to input blobs.
3118         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
3119             blobManager.addReference(LayerPin(0, i));
3120         for (it = layers.begin(); it != layers.end(); ++it)
3121         {
3122             const LayerData& ld = it->second;
3123             blobManager.addReferences(ld.inputBlobsId);
3124         }
3125
3126         for (int i = 0; i < blobsToKeep_.size(); i++)
3127         {
3128             blobManager.addReference(blobsToKeep_[i]);
3129         }
3130
3131         for (it = layers.begin(); it != layers.end(); it++)
3132         {
3133             int lid = it->first;
3134             allocateLayer(lid, layersShapes);
3135         }
3136
3137         layersTimings.resize(lastLayerId + 1, 0);
3138         fuseLayers(blobsToKeep_);
3139     }
3140
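    // Executes a single layer. Skipped (fused) layers are not run; otherwise the layer is
    // forwarded either through its backend node (CUDA, Halide, IE, nGraph or Vulkan) or via
    // the default OpenCV implementation (optionally OpenCL), and its timing is recorded.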
3141     void forwardLayer(LayerData &ld)
3142     {
3143         CV_TRACE_FUNCTION();
3144
3145         Ptr<Layer> layer = ld.layerInstance;
3146
3147         if( !ld.skip )
3148         {
3149             TickMeter tm;
3150             tm.start();
3151
3152             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
3153             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
3154             {
3155                 if (isAsync)
3156                     CV_Error(Error::StsNotImplemented, "Default implementation fallback is not supported in asynchronous mode");
3157
3158                 if (!layer->supportBackend(DNN_BACKEND_OPENCV))
3159                     CV_Error(Error::StsNotImplemented, format("Layer \"%s\" of type \"%s\" unsupported on OpenCV backend",
3160                                                        ld.name.c_str(), ld.type.c_str()));
3161
3162 #ifdef HAVE_OPENCL
3163                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
3164                 {
3165                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
3166                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
3167                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
3168                     layer->forward(umat_inputBlobs,
3169                                    umat_outputBlobs,
3170                                    umat_internalBlobs);
3171                     if (DNN_CHECK_NAN_INF)
3172                     {
3173                         bool fail = false;
3174                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
3175                         {
3176                             UMat& u = umat_outputBlobs[i];
3177                             Mat m;
3178                             if (u.depth() == CV_16S) // FP16
3179                                 convertFp16(u, m);
3180                             else
3181                                 m = u.getMat(ACCESS_READ);
3182                             if (!checkRange(m))
3183                             {
3184                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
3185                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
3186                                 fail = true;
3187                             }
3188                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
3189                             {
3190                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
3191                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
3192                                 fail = true;
3193                             }
3194                         }
3195                         if (fail)
3196                         {
3197                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
3198                             {
3199                                 UMat& u = umat_inputBlobs[i];
3200                                 Mat m;
3201                                 if (u.depth() == CV_16S) // FP16
3202                                     convertFp16(u, m);
3203                                 else
3204                                     m = u.getMat(ACCESS_READ);
3205                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
3206                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
3207                             }
3208                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
3209                             {
3210                                 UMat& u = umat_outputBlobs[i];
3211                                 Mat m;
3212                                 if (u.depth() == CV_16S) // FP16
3213                                     convertFp16(u, m);
3214                                 else
3215                                     m = u.getMat(ACCESS_READ);
3216                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
3217                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
3218                             }
3219                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
3220                             {
3221                                 UMat& u = umat_internalBlobs[i];
3222                                 Mat m;
3223                                 if (u.depth() == CV_16S) // FP16
3224                                     convertFp16(u, m);
3225                                 else
3226                                     m = u.getMat(ACCESS_READ);
3227                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
3228                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
3229                             }
3230                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
3231                                 CV_Assert(!fail);
3232                         }
3233                     }
3234                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
3235                 }
3236                 else
3237 #endif
3238                 {
3239                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
3240                     {
3241                         if (!ld.inputBlobsWrappers[i].empty())
3242                             ld.inputBlobsWrappers[i]->copyToHost();
3243                     }
3244
3245                     std::vector<Mat> inps(ld.inputBlobs.size());
3246                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
3247                     {
3248                         inps[i] = *ld.inputBlobs[i];
3249                     }
3250                     layer->forward(inps, ld.outputBlobs, ld.internals);
3251
3252                     if (DNN_CHECK_NAN_INF)
3253                     {
3254                         bool fail = false;
3255                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
3256                         {
3257                             const Mat& m = ld.outputBlobs[i];
3258                             if (!checkRange(m))
3259                             {
3260                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
3261                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
3262                                 fail = true;
3263                             }
3264                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
3265                             {
3266                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
3267                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
3268                                 fail = true;
3269                             }
3270                         }
3271                         if (fail)
3272                         {
3273                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
3274                             {
3275                                 const Mat* pM = ld.inputBlobs[i];
3276                                 if (!pM)
3277                                 {
3278                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
3279                                     continue;
3280                                 }
3281                                 const Mat& m = *pM;
3282                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
3283                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
3284                             }
3285                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
3286                             {
3287                                 const Mat& m = ld.outputBlobs[i];
3288                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
3289                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
3290                             }
3291                             for (size_t i = 0; i < ld.internals.size(); ++i)
3292                             {
3293                                 const Mat& m = ld.internals[i];
3294                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
3295                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
3296                             }
3297                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
3298                                 CV_Assert(!fail);
3299                         }
3300                     }
3301
3302                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
3303                     {
3304                         if (!ld.outputBlobsWrappers[i].empty())
3305                             ld.outputBlobsWrappers[i]->setHostDirty();
3306                     }
3307                 }
3308             }
3309             else
3310             {
3311                 Ptr<BackendNode> node = it->second;
3312                 CV_Assert(!node.empty());
3313                 if (preferableBackend == DNN_BACKEND_CUDA)
3314                 {
3315                     CV_Assert(haveCUDA());
3316
3317 #ifdef HAVE_CUDA
3318                     Ptr<CUDABackendNode> cudaNode = node.dynamicCast<CUDABackendNode>();
3319                     CV_Assert(!cudaNode.empty());
3320
3321                     cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
3322
3323                     for (auto id : ld.cudaD2HBackgroundTransfers)
3324                     {
3325                         auto wrapper = ld.outputBlobsWrappers[id].dynamicCast<CUDABackendWrapper>();
3326                         wrapper->copyToHostInBackground();
3327                     }
3328 #endif
3329                 }
3330                 else if (preferableBackend == DNN_BACKEND_HALIDE)
3331                 {
3332                     forwardHalide(ld.outputBlobsWrappers, node);
3333                 }
3334                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
3335                 {
3336                     forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
3337                 }
3338                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
3339                 {
3340                     forwardNgraph(ld.outputBlobsWrappers, node, isAsync);
3341                 }
3342                 else if (preferableBackend == DNN_BACKEND_VKCOM)
3343                 {
3344                     try
3345                     {
3346                         forwardVkCom(ld.outputBlobsWrappers, node);
3347                     }
3348                     catch (const cv::Exception& e)
3349                     {
3350                         CV_LOG_ERROR(NULL, "forwardVkCom failed, fallback to CPU implementation. " << e.what());
3351                         it->second = Ptr<BackendNode>();
3352                         forwardLayer(ld);
3353                     }
3354                 }
3355                 else
3356                 {
3357                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
3358                 }
3359             }
3360
3361             tm.stop();
3362             int64 t = tm.getTimeTicks();
3363             layersTimings[ld.id] = (t > 0) ? t : t + 1;  // zero for skipped layers only
3364         }
3365         else
3366         {
3367             layersTimings[ld.id] = 0;
3368         }
3369
3370         ld.flag = 1;
3371     }
3372
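    // Forwards every not-yet-computed layer with a smaller id, then the requested layer itself;
    // iterating the id-ordered layer map preserves the order in which layers were added.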
3373     void forwardToLayer(LayerData &ld, bool clearFlags = true)
3374     {
3375         CV_TRACE_FUNCTION();
3376
3377         if (clearFlags)
3378         {
3379             MapIdToLayerData::iterator it;
3380             for (it = layers.begin(); it != layers.end(); it++)
3381                 it->second.flag = 0;
3382         }
3383
3384         // already forwarded
3385         if (ld.flag)
3386             return;
3387
3388         //forward parents
3389         MapIdToLayerData::iterator it;
3390         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
3391         {
3392             LayerData &ld = it->second;
3393             if (ld.flag)
3394                 continue;
3395             forwardLayer(ld);
3396         }
3397
3398         //forward itself
3399         forwardLayer(ld);
3400
3401 #ifdef HAVE_CUDA
3402         if (preferableBackend == DNN_BACKEND_CUDA)
3403             cudaInfo->context.stream.synchronize();
3404 #endif
3405     }
3406
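    // Computes input/output/internal shapes for the given layer, first resolving the shapes of
    // its producers recursively and then calling Layer::getMemoryShapes(). On failure, the
    // offending layer's inputs, outputs and blobs are logged before the exception is rethrown.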
3407     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
3408     {
3409         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
3410
3411         if (id == 0 && inOutShapes[id].in[0].empty())
3412         {
3413             if (!layers[0].outputBlobs.empty())
3414             {
3415                 ShapesVec shapes;
3416                 for (int i = 0; i < layers[0].outputBlobs.size(); i++)
3417                 {
3418                     Mat& inp = layers[0].outputBlobs[i];
3419                     CV_Assert(inp.total());
3420                     shapes.push_back(shape(inp));
3421                 }
3422                 inOutShapes[0].in = shapes;
3423             }
3424             else
3425             {
3426                 const std::vector<MatShape>& inputShapes = netInputLayer->shapes;
3427                 bool none = true;
3428                 for (size_t i = 0; i < inputShapes.size(); i++)
3429                 {
3430                     if (!inputShapes[i].empty())
3431                     {
3432                         none = false;
3433                         break;
3434                     }
3435                 }
3436                 if (none)
3437                 {
3438                     inOutShapes[0].out.clear();
3439                     return;
3440                 }
3441                 else
3442                 {
3443                     inOutShapes[0].in = inputShapes;
3444                 }
3445             }
3446         }
3447
3448         if (inOutShapes[id].in.empty())
3449         {
3450             for(int i = 0; i < inputLayerIds.size(); i++)
3451             {
3452                 int layerId = inputLayerIds[i].lid;
3453                 LayersShapesMap::iterator it =
3454                         inOutShapes.find(layerId);
3455                 if(it == inOutShapes.end() ||
3456                         it->second.out.empty())
3457                 {
3458                     getLayerShapesRecursively(layerId, inOutShapes);
3459                 }
3460                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
3461                 inOutShapes[id].in.push_back(shape);
3462             }
3463         }
3464         const ShapesVec& is = inOutShapes[id].in;
3465         ShapesVec& os = inOutShapes[id].out;
3466         ShapesVec& ints = inOutShapes[id].internal;
3467         int requiredOutputs = layers[id].requiredOutputs.size();
3468         Ptr<Layer> l = layers[id].getLayerInstance();
3469         CV_Assert(l);
3470         bool layerSupportInPlace = false;
3471         try
3472         {
3473             layerSupportInPlace = l->getMemoryShapes(is, requiredOutputs, os, ints);
3474         }
3475         catch (const cv::Exception& e)
3476         {
3477             CV_LOG_ERROR(NULL, "OPENCV/DNN: [" << l->type << "]:(" << l->name << "): getMemoryShapes() throws exception." <<
3478                     " inputs=" << is.size() <<
3479                     " outputs=" << os.size() << "/" << requiredOutputs <<
3480                     " blobs=" << l->blobs.size());
3481             for (size_t i = 0; i < is.size(); ++i)
3482             {
3483                 CV_LOG_ERROR(NULL, "    input[" << i << "] = " << toString(is[i]));
3484             }
3485             for (size_t i = 0; i < os.size(); ++i)
3486             {
3487                 CV_LOG_ERROR(NULL, "    output[" << i << "] = " << toString(os[i]));
3488             }
3489             for (size_t i = 0; i < l->blobs.size(); ++i)
3490             {
3491                 CV_LOG_ERROR(NULL, "    blobs[" << i << "] = " << typeToString(l->blobs[i].type()) << " " << toString(shape(l->blobs[i])));
3492             }
3493             CV_LOG_ERROR(NULL, "Exception message: " << e.what());
3494             throw;
3495         }
3496         inOutShapes[id].supportInPlace = layerSupportInPlace;
3497
3498         for (int i = 0; i < ints.size(); i++)
3499             CV_Assert(total(ints[i]) > 0);
3500
3501         for (int i = 0; i < os.size(); i++)
3502             CV_Assert(total(os[i]) > 0);
3503     }
3504
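    // Propagates the network input shapes through every layer (see getLayerShapesRecursively).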
3505     void getLayersShapes(const ShapesVec& netInputShapes,
3506                          LayersShapesMap& inOutShapes)
3507     {
3508         inOutShapes.clear();
3509
3510         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
3511         for (MapIdToLayerData::iterator it = layers.begin();
3512              it != layers.end(); it++)
3513         {
3514             getLayerShapesRecursively(it->first, inOutShapes);
3515         }
3516     }
3517
3518     void getLayerShapes(const ShapesVec& netInputShapes,
3519                         const int layerId,
3520                         LayerShapes& shapes)
3521     {
3522         LayersShapesMap inOutShapes;
3523         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
3524         getLayerShapesRecursively(layerId, inOutShapes);
3525         shapes = inOutShapes[layerId];
3526     }
3527
3528     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
3529     {
3530         return *std::max_element(pins.begin(), pins.end());
3531     }
3532
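    // Returns the output blob for the given pin: for non-CPU targets the data is copied back
    // to the host first, and FP16 results (stored as CV_16S) are converted to FP32.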
3533     Mat getBlob(const LayerPin& pin)
3534     {
3535         CV_TRACE_FUNCTION();
3536
3537         if (!pin.valid())
3538             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
3539
3540         LayerData &ld = layers[pin.lid];
3541         if ((size_t)pin.oid >= ld.outputBlobs.size())
3542         {
3543             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %zu outputs, "
3544                                            "but output #%d was requested", ld.name.c_str(),
3545                                            ld.outputBlobs.size(), pin.oid));
3546         }
3547         if (preferableTarget != DNN_TARGET_CPU)
3548         {
3549             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
3550             // Transfer data to CPU if it's required.
3551             ld.outputBlobsWrappers[pin.oid]->copyToHost();
3552         }
3553
3554         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
3555         {
3556             convertFp16(ld.outputBlobs[pin.oid], output_blob);
3557             return output_blob;
3558         }
3559         else
3560             return ld.outputBlobs[pin.oid];
3561     }
3562
3563     Mat getBlob(String outputName)
3564     {
3565         return getBlob(getPinByAlias(outputName));
3566     }
3567
3568 #ifdef CV_CXX11
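    // Asynchronous counterpart of getBlob(); only implemented for the Inference Engine backends,
    // where the backend wrapper exposes a futureMat. A minimal calling sketch through the public
    // API (variable names are hypothetical):
    //   AsyncArray out = net.forwardAsync();
    //   Mat result;
    //   out.get(result);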
3569     AsyncArray getBlobAsync(const LayerPin& pin)
3570     {
3571         CV_TRACE_FUNCTION();
3572 #ifdef HAVE_INF_ENGINE
3573         if (!pin.valid())
3574             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
3575
3576         LayerData &ld = layers[pin.lid];
3577         if ((size_t)pin.oid >= ld.outputBlobs.size())
3578         {
3579             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
3580                                            "but output #%d was requested", ld.name.c_str(),
3581                                            (int)ld.outputBlobs.size(), (int)pin.oid));
3582         }
3583         if (preferableTarget != DNN_TARGET_CPU)
3584         {
3585             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
3586             // Transfer data to CPU if it's required.
3587             ld.outputBlobsWrappers[pin.oid]->copyToHost();
3588         }
3589         CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH);
3590
3591         if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019) {
3592 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
3593             Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
3594             return std::move(wrapper->futureMat);
3595 #else
3596             CV_Error(Error::StsNotImplemented, "This OpenCV version is built without Inference Engine NN Builder API support");
3597 #endif
3598         }
3599         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
3600         {
3601 #ifdef HAVE_DNN_NGRAPH
3602             Ptr<NgraphBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<NgraphBackendWrapper>();
3603             return std::move(wrapper->futureMat);
3604 #else
3605             CV_Error(Error::StsNotImplemented, "This OpenCV version is built without support of Inference Engine + nGraph");
3606 #endif
3607         }
3608 #endif  // HAVE_INF_ENGINE
3609         CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 or DNN_BACKEND_INFERENCE_ENGINE_NGRAPH backend is required");
3610     }
3611
3612     AsyncArray getBlobAsync(String outputName)
3613     {
3614         return getBlobAsync(getPinByAlias(outputName));
3615     }
3616 #endif  // CV_CXX11
3617
3618 #ifdef HAVE_INF_ENGINE
3619     static
3620     Net createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNet);
3621 #endif
3622
3623     string dump();
3624
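    // Writes the GraphViz (.dot) representation produced by dump() to "<dump-base>.dot"; any
    // failure is recorded in a companion ".error" file instead. The result can be rendered with
    // GraphViz if it is installed, e.g.: dot -Tpng net.dot -o net.png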
3625     void dumpNetworkToFile()
3626     {
3627 #ifndef OPENCV_DNN_DISABLE_NETWORK_AUTO_DUMP
3628         string dumpFileNameBase = getDumpFileNameBase();
3629         string dumpFileName = dumpFileNameBase + ".dot";
3630         try
3631         {
3632             string dumpStr = dump();
3633             std::ofstream out(dumpFileName.c_str(), std::ios::out | std::ios::binary);
3634             out << dumpStr;
3635         }
3636         catch (const std::exception& e)
3637         {
3638             std::ofstream out((dumpFileName + ".error").c_str(), std::ios::out);
3639             out << "Exception: " << e.what() << std::endl;
3640         }
3641         catch (...)
3642         {
3643             std::ofstream out((dumpFileName + ".error").c_str(), std::ios::out);
3644             out << "Can't dump: unknown exception" << std::endl;
3645         }
3646 #endif
3647     }
3648 };
3649
3650 Net::Net() : impl(new Net::Impl)
3651 {
3652 }
3653
3654 #ifdef HAVE_INF_ENGINE
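// Builds a cv::dnn::Net around an already-parsed IE CNNNetwork: the IE inputs become network
// inputs, each IE output is registered as a placeholder layer, and a single backend node
// (NN Builder or nGraph, depending on the configured IE backend) executes the imported network.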
3655 /*static*/
3656 Net Net::Impl::createNetworkFromModelOptimizer(InferenceEngine::CNNNetwork& ieNet)
3657 {
3658     CV_TRACE_FUNCTION();
3659
3660     CV_TRACE_REGION("register_inputs");
3661
3662     std::vector<String> inputsNames;
3663     std::vector<MatShape> inp_shapes;
3664     for (auto& it : ieNet.getInputsInfo())
3665     {
3666         inputsNames.push_back(it.first);
3667         std::vector<size_t> dims = it.second->getTensorDesc().getDims();
3668         inp_shapes.push_back(std::vector<int>(dims.begin(), dims.end()));
3669     }
3670
3671     Net cvNet;
3672     cvNet.setInputsNames(inputsNames);
3673
3674     // declare the input shapes up front (no input data is set yet)
3675     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
3676     {
3677         cvNet.setInputShape(inputsNames[inp_id], inp_shapes[inp_id]);
3678     }
3679
3680     CV_TRACE_REGION_NEXT("backendNode");
3681
3682     Ptr<BackendNode> backendNode;
3683 #ifdef HAVE_DNN_NGRAPH
3684     if (DNN_BACKEND_INFERENCE_ENGINE_NGRAPH == getInferenceEngineBackendTypeParam())
3685     {
3686         auto fake_node = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{});
3687         Ptr<InfEngineNgraphNode> backendNodeNGraph(new InfEngineNgraphNode(fake_node));
3688         backendNodeNGraph->net = Ptr<InfEngineNgraphNet>(new InfEngineNgraphNet(*(cvNet.impl), ieNet));
3689         backendNode = backendNodeNGraph;
3690     }
3691     else
3692 #endif
3693     {
3694 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
3695         Ptr<InfEngineBackendNode> backendNodeNN(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
3696         backendNodeNN->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
3697         backendNode = backendNodeNN;
3698 #else
3699         CV_Error(Error::StsNotImplemented, "This OpenCV version is built without Inference Engine NN Builder API support");
3700 #endif
3701     }
3702
3703     CV_TRACE_REGION_NEXT("register_outputs");
3704
3705 #ifdef HAVE_DNN_NGRAPH
3706     auto ngraphFunction = ieNet.getFunction();
3707 #if INF_ENGINE_VER_MAJOR_LT(INF_ENGINE_RELEASE_2020_2)
3708     std::list< std::shared_ptr<ngraph::Node> > ngraphOperations;
3709 #else
3710     std::vector< std::shared_ptr<ngraph::Node> > ngraphOperations;
3711 #endif
3712     if (ngraphFunction)
3713     {
3714         ngraphOperations = ngraphFunction->get_ops();
3715     }
3716 #endif
3717
3718     for (auto& it : ieNet.getOutputsInfo())
3719     {
3720         CV_TRACE_REGION("output");
3721         const auto& outputName = it.first;
3722
3723         LayerParams lp;
3724         int lid = cvNet.addLayer(it.first, "", lp);
3725
3726         LayerData& ld = cvNet.impl->layers[lid];
3727
3728 #ifdef HAVE_DNN_NGRAPH
3729         if (DNN_BACKEND_INFERENCE_ENGINE_NGRAPH == getInferenceEngineBackendTypeParam())
3730         {
3731             Ptr<Layer> cvLayer(new NgraphBackendLayer(ieNet));
3732             cvLayer->name = outputName;
3733             cvLayer->type = "_unknown_";
3734
3735             auto process_layer = [&](const std::string& name) -> bool
3736             {
3737                 if (ngraphFunction)
3738                 {
3739                     CV_TRACE_REGION("ngraph_function");
3740                     for (const auto& op : ngraphOperations)
3741                     {
3742                         CV_Assert(op);
3743                         if (op->get_friendly_name() == name)
3744                         {
3745                             const std::string typeName = op->get_type_info().name;
3746                             cvLayer->type = typeName;
3747                             return true;
3748                         }
3749                     }
3750                     return false;
3751                 }
3752                 else
3753                 {
3754 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2020_4)
3755                     CV_Error(Error::StsNotImplemented, "This OpenCV version is built with Inference Engine which has dropped IR v7 support");
3756 #else
3757                     CV_TRACE_REGION("legacy_cnn_layer");
3758                     try
3759                     {
3760                         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(name.c_str());
3761                         CV_Assert(ieLayer);
3762
3763                         cvLayer->type = ieLayer->type;
3764                         return true;
3765                     }
3766                     catch (const std::exception& e)
3767                     {
3768                         CV_UNUSED(e);
3769                         CV_LOG_DEBUG(NULL, "IE layer extraction failure: '" << name << "' - " << e.what());
3770                         return false;
3771                     }
3772 #endif
3773
3774                 }
3775             };
3776
3777             bool found = process_layer(outputName);
3778             if (!found)
3779             {
3780                 auto pos = outputName.rfind('.');  // cut port number: ".0"
3781                 if (pos != std::string::npos)
3782                 {
3783                     std::string layerName = outputName.substr(0, pos);
3784                     found = process_layer(layerName);
3785                 }
3786             }
3787             if (!found)
3788                 CV_LOG_WARNING(NULL, "DNN/IE: Can't determine output layer type: '" << outputName << "'");
3789
3790             ld.layerInstance = cvLayer;
3791             ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE_NGRAPH] = backendNode;
3792         }
3793         else
3794 #endif
3795         {
3796 #ifdef HAVE_DNN_IE_NN_BUILDER_2019
3797             Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
3798
3799             InferenceEngine::CNNLayerPtr ieLayer;
3800             try
3801             {
3802                 ieLayer = ieNet.getLayerByName(outputName.c_str());
3803             }
3804             catch (...)
3805             {
3806                 auto pos = outputName.rfind('.');  // cut port number: ".0"
3807                 if (pos != std::string::npos)
3808                 {
3809                     std::string layerName = outputName.substr(0, pos);
3810                     ieLayer = ieNet.getLayerByName(layerName.c_str());
3811                 }
3812             }
3813             CV_Assert(ieLayer);
3814
3815             cvLayer->name = outputName;
3816             cvLayer->type = ieLayer->type;
3817             ld.layerInstance = cvLayer;
3818
3819             ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019] = backendNode;
3820 #else
3821             CV_Error(Error::StsNotImplemented, "This OpenCV version is built without Inference Engine NN Builder API support");
3822 #endif
3823         }
3824
3825         for (int i = 0; i < inputsNames.size(); ++i)
3826             cvNet.connect(0, i, lid, i);
3827     }
3828
3829     CV_TRACE_REGION_NEXT("finalize");
3830
3831     cvNet.setPreferableBackend(getInferenceEngineBackendTypeParam());
3832
3833     cvNet.impl->skipInfEngineInit = true;
3834     return cvNet;
3835 }
3836 #endif  // HAVE_INF_ENGINE
3837
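// A minimal usage sketch for the file-based overload (paths and variable names are placeholders):
//   Net net = Net::readFromModelOptimizer("model.xml", "model.bin");
//   net.setInput(blobFromImage(img));
//   Mat prob = net.forward();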
3838 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
3839 {
3840     CV_TRACE_FUNCTION();
3841 #ifndef HAVE_INF_ENGINE
3842     CV_UNUSED(xml); CV_UNUSED(bin);
3843     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
3844 #else
3845 #if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R3)
3846     InferenceEngine::CNNNetReader reader;
3847     reader.ReadNetwork(xml);
3848     reader.ReadWeights(bin);
3849
3850     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
3851 #else
3852     InferenceEngine::Core& ie = getCore("");
3853     InferenceEngine::CNNNetwork ieNet = ie.ReadNetwork(xml, bin);
3854 #endif
3855
3856     return Impl::createNetworkFromModelOptimizer(ieNet);
3857 #endif  // HAVE_INF_ENGINE
3858 }
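// Illustrative usage sketch (not part of the library): loading an OpenVINO IR pair
// through the public wrapper. The file names are hypothetical.
//
//     Net net = readNetFromModelOptimizer("model.xml", "model.bin");  // thin wrapper over Net::readFromModelOptimizer
//     net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);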
3859
3860 Net Net::readFromModelOptimizer(const std::vector<uchar>& bufferModelConfig, const std::vector<uchar>& bufferWeights)
3861 {
3862     CV_TRACE_FUNCTION();
3863     CV_Assert(!bufferModelConfig.empty());
3864     CV_Assert(!bufferWeights.empty());
3865     return readFromModelOptimizer(bufferModelConfig.data(), bufferModelConfig.size(),
3866                                            bufferWeights.data(), bufferWeights.size());
3867 }
3868
3869 Net Net::readFromModelOptimizer(
3870         const uchar* bufferModelConfigPtr, size_t bufferModelConfigSize,
3871         const uchar* bufferWeightsPtr, size_t bufferWeightsSize
3872 )
3873 {
3874     CV_TRACE_FUNCTION();
3875 #ifndef HAVE_INF_ENGINE
3876     CV_UNUSED(bufferModelConfigPtr); CV_UNUSED(bufferWeightsPtr);
3877     CV_UNUSED(bufferModelConfigSize); CV_UNUSED(bufferWeightsSize);
3878     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
3879 #else
3880
3881 #if INF_ENGINE_VER_MAJOR_LE(INF_ENGINE_RELEASE_2019R3)
3882     InferenceEngine::CNNNetReader reader;
3883
3884     try
3885     {
3886         reader.ReadNetwork(bufferModelConfigPtr, bufferModelConfigSize);
3887
3888         InferenceEngine::TensorDesc tensorDesc(InferenceEngine::Precision::U8, { bufferWeightsSize }, InferenceEngine::Layout::C);
3889         InferenceEngine::TBlob<uint8_t>::Ptr weightsBlobPtr(new InferenceEngine::TBlob<uint8_t>(tensorDesc));
3890         weightsBlobPtr->allocate();
3891         std::memcpy(weightsBlobPtr->buffer(), (uchar*)bufferWeightsPtr, bufferWeightsSize);
3892         reader.SetWeights(weightsBlobPtr);
3893     }
3894     catch (const std::exception& e)
3895     {
3896         CV_Error(Error::StsError, std::string("DNN: IE failed to load model: ") + e.what());
3897     }
3898
3899     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
3900 #else
3901     InferenceEngine::Core& ie = getCore("");
3902
3903     std::string model; model.assign((char*)bufferModelConfigPtr, bufferModelConfigSize);
3904
3905     InferenceEngine::CNNNetwork ieNet;
3906     try
3907     {
3908         InferenceEngine::TensorDesc tensorDesc(InferenceEngine::Precision::U8, { bufferWeightsSize }, InferenceEngine::Layout::C);
3909         InferenceEngine::Blob::CPtr weights_blob = InferenceEngine::make_shared_blob<uint8_t>(tensorDesc, (uint8_t*)bufferWeightsPtr, bufferWeightsSize);
3910
3911         ieNet = ie.ReadNetwork(model, weights_blob);
3912     }
3913     catch (const std::exception& e)
3914     {
3915         CV_Error(Error::StsError, std::string("DNN: IE failed to load model: ") + e.what());
3916     }
3917 #endif
3918
3919     return Impl::createNetworkFromModelOptimizer(ieNet);
3920 #endif  // HAVE_INF_ENGINE
3921 }
3922
3923
3924 Net::~Net()
3925 {
3926 }
3927
3928 int Net::addLayer(const String &name, const String &type, LayerParams &params)
3929 {
3930     CV_TRACE_FUNCTION();
3931
3932     if (impl->getLayerId(name) >= 0)
3933     {
3934         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already exists in the net");
3935         return -1;
3936     }
3937
3938     int id = ++impl->lastLayerId;
3939     impl->layerNameToId.insert(std::make_pair(name, id));
3940     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
3941
3942     return id;
3943 }
3944
3945 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
3946 {
3947     CV_TRACE_FUNCTION();
3948
3949     int prvLid = impl->lastLayerId;
3950     int newLid = this->addLayer(name, type, params);
3951     this->connect(prvLid, 0, newLid, 0);
3952     return newLid;
3953 }
3954
3955 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
3956 {
3957     CV_TRACE_FUNCTION();
3958
3959     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
3960 }
3961
3962 void Net::connect(String _outPin, String _inPin)
3963 {
3964     CV_TRACE_FUNCTION();
3965
3966     LayerPin outPin = impl->getPinByAlias(_outPin);
3967     LayerPin inpPin = impl->getPinByAlias(_inPin);
3968
3969     CV_Assert(outPin.valid() && inpPin.valid());
3970
3971     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
3972 }
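// Illustrative usage sketch (not part of the library): building a tiny graph by hand.
// Layer names are hypothetical; layer id 0 is the implicit input (Data) layer.
//
//     LayerParams lp;
//     Net net;
//     int id1 = net.addLayer("first", "Identity", lp);
//     net.connect(0, 0, id1, 0);                                // attach to the net input layer
//     int id2 = net.addLayerToPrev("second", "Identity", lp);   // adds a layer and connects it to "first"
//     // string pin aliases have the form "layerName.outputIndex", e.g. "first.0"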
3973
3974 Mat Net::forward(const String& outputName)
3975 {
3976     CV_TRACE_FUNCTION();
3977     CV_Assert(!empty());
3978
3979     String layerName = outputName;
3980
3981     if (layerName.empty())
3982     {
3983         std::vector<String> layerNames = getLayerNames();
3984         CV_Assert(!layerNames.empty());
3985         layerName = layerNames.back();
3986     }
3987
3988     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
3989     impl->setUpNet(pins);
3990     impl->forwardToLayer(impl->getLayerData(layerName));
3991
3992     return impl->getBlob(layerName);
3993 }
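// Illustrative usage sketch (not part of the library): single-output inference.
// The blob geometry and the output layer name ("prob") are hypothetical.
//
//     Mat blob = blobFromImage(img, 1.0 / 255.0, Size(224, 224));
//     net.setInput(blob);
//     Mat out  = net.forward();        // runs up to the last layer
//     Mat prob = net.forward("prob");  // or up to a named layer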
3994
3995 AsyncArray Net::forwardAsync(const String& outputName)
3996 {
3997     CV_TRACE_FUNCTION();
3998     CV_Assert(!empty());
3999
4000 #ifdef CV_CXX11
4001     String layerName = outputName;
4002
4003     if (layerName.empty())
4004     {
4005         std::vector<String> layerNames = getLayerNames();
4006         CV_Assert(!layerNames.empty());
4007         layerName = layerNames.back();
4008     }
4009
4010     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
4011     impl->setUpNet(pins);
4012
4013     if (!(impl->preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 || impl->preferableBackend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH))
4014         CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward is supported for Inference Engine backends only");
4015
4016     impl->isAsync = true;
4017     impl->forwardToLayer(impl->getLayerData(layerName));
4018     impl->isAsync = false;
4019
4020     return impl->getBlobAsync(layerName);
4021 #else
4022     CV_Error(Error::StsNotImplemented, "DNN: Asynchronous forward requires build with enabled C++11");
4023 #endif  // CV_CXX11
4024 }
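// Illustrative usage sketch (not part of the library): asynchronous inference.
// Requires one of the Inference Engine backends, as checked above.
//
//     net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
//     net.setInput(blob);
//     AsyncArray async = net.forwardAsync();
//     Mat result;
//     async.get(result);  // blocks until the request completes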
4025
4026 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
4027 {
4028     CV_TRACE_FUNCTION();
4029     CV_Assert(!empty());
4030
4031     String layerName = outputName;
4032
4033     if (layerName.empty())
4034     {
4035         std::vector<String> layerNames = getLayerNames();
4036         CV_Assert(!layerNames.empty());
4037         layerName = layerNames.back();
4038     }
4039
4040     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
4041     impl->setUpNet(pins);
4042     impl->forwardToLayer(impl->getLayerData(layerName));
4043
4044     LayerPin pin = impl->getPinByAlias(layerName);
4045     LayerData &ld = impl->layers[pin.lid];
4046
4047     if (outputBlobs.isUMat())
4048     {
4049         impl->getBlob(layerName).copyTo(outputBlobs);
4050     }
4051     else if (outputBlobs.isMat())
4052     {
4053         outputBlobs.assign(impl->getBlob(layerName));
4054     }
4055     else if (outputBlobs.isMatVector())
4056     {
4057         if (impl->preferableTarget != DNN_TARGET_CPU)
4058         {
4059             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
4060             {
4061                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
4062                 ld.outputBlobsWrappers[i]->copyToHost();
4063             }
4064         }
4065         if (ld.outputBlobs[0].depth() == CV_32F)
4066         {
4067             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
4068             outputvec = ld.outputBlobs;
4069         } else {
4070             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
4071             outputvec.resize(ld.outputBlobs.size());
4072             for (int i = 0; i < outputvec.size(); i++)
4073                 convertFp16(ld.outputBlobs[i], outputvec[i]);
4074         }
4075     }
4076     else if (outputBlobs.isUMatVector())
4077     {
4078         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
4079
4080 #ifdef HAVE_OPENCL
4081         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
4082             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
4083         {
4084             if (impl->preferableTarget == DNN_TARGET_OPENCL)
4085                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
4086             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
4087             {
4088                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
4089                 outputvec.resize(out_vec.size());
4090                 for (int i = 0; i < out_vec.size(); i++)
4091                     convertFp16(out_vec[i], outputvec[i]);
4092             }
4093         }
4094         else
4095 #endif
4096         {
4097             outputvec.resize(ld.outputBlobs.size());
4098             for (int i = 0; i < outputvec.size(); ++i)
4099                 ld.outputBlobs[i].copyTo(outputvec[i]);
4100         }
4101     }
4102 }
4103
4104 void Net::forward(OutputArrayOfArrays outputBlobs,
4105                   const std::vector<String>& outBlobNames)
4106 {
4107     CV_TRACE_FUNCTION();
4108
4109     std::vector<LayerPin> pins;
4110     for (int i = 0; i < outBlobNames.size(); i++)
4111     {
4112         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
4113     }
4114
4115     impl->setUpNet(pins);
4116
4117     LayerPin out = impl->getLatestLayerPin(pins);
4118
4119     impl->forwardToLayer(impl->getLayerData(out.lid));
4120
4121     std::vector<Mat> matvec;
4122     for (int i = 0; i < pins.size(); i++)
4123     {
4124         matvec.push_back(impl->getBlob(pins[i]));
4125     }
4126
4127     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
4128     outputvec = matvec;
4129 }
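// Illustrative usage sketch (not part of the library): fetching several outputs at once,
// e.g. for detection networks with multiple output layers.
//
//     std::vector<Mat> outs;
//     net.forward(outs, net.getUnconnectedOutLayersNames());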
4130
4131 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
4132                      const std::vector<String>& outBlobNames)
4133 {
4134     CV_TRACE_FUNCTION();
4135
4136     std::vector<LayerPin> pins;
4137     for (int i = 0; i < outBlobNames.size(); i++)
4138     {
4139         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
4140     }
4141
4142     impl->setUpNet(pins);
4143
4144     LayerPin out = impl->getLatestLayerPin(pins);
4145
4146     impl->forwardToLayer(impl->getLayerData(out.lid));
4147
4148     outputBlobs.resize(outBlobNames.size());
4149     for (int i = 0; i < outBlobNames.size(); i++)
4150     {
4151         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
4152         outputBlobs[i].resize(lp.size());
4153         for (int j = 0; j < lp.size(); j++)
4154         {
4155             outputBlobs[i][j] = impl->getBlob(lp[j]);
4156         }
4157     }
4158 }
4159
4160 void Net::setPreferableBackend(int backendId)
4161 {
4162     CV_TRACE_FUNCTION();
4163     CV_TRACE_ARG(backendId);
4164
4165 #ifdef HAVE_INF_ENGINE
4166     if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
4167         backendId = getInferenceEngineBackendTypeParam();
4168 #endif
4169
4170     if( impl->preferableBackend != backendId )
4171     {
4172         impl->preferableBackend = backendId;
4173         impl->netWasAllocated = false;
4174         impl->clear();
4175     }
4176 }
4177
4178 void Net::setPreferableTarget(int targetId)
4179 {
4180     CV_TRACE_FUNCTION();
4181     CV_TRACE_ARG(targetId);
4182
4183     if( impl->preferableTarget != targetId )
4184     {
4185         impl->preferableTarget = targetId;
4186         if (IS_DNN_OPENCL_TARGET(targetId))
4187         {
4188 #ifndef HAVE_OPENCL
4189 #ifdef HAVE_INF_ENGINE
4190             if (impl->preferableBackend == DNN_BACKEND_OPENCV)
4191 #else
4192             if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
4193                 impl->preferableBackend == DNN_BACKEND_OPENCV)
4194 #endif  // HAVE_INF_ENGINE
4195                 impl->preferableTarget = DNN_TARGET_CPU;
4196 #else
4197             bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
4198             if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
4199                 impl->preferableTarget = DNN_TARGET_OPENCL;
4200 #endif
4201         }
4202         impl->netWasAllocated = false;
4203         impl->clear();
4204     }
4205 }
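// Illustrative usage sketch (not part of the library): selecting the execution backend/target.
// As implemented above, an OpenCL target silently falls back when OpenCL or FP16 is unavailable.
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL);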
4206
4207 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
4208 {
4209     CV_TRACE_FUNCTION();
4210
4211     impl->netInputLayer->setNames(inputBlobNames);
4212 }
4213
4214 void Net::setInputShape(const String &inputName, const MatShape& shape)
4215 {
4216     CV_TRACE_FUNCTION();
4217
4218     impl->netInputLayer->setInputShape(inputName, shape);
4219 }
4220
4221 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
4222 {
4223     CV_TRACE_FUNCTION();
4224     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
4225
4226     LayerPin pin;
4227     pin.lid = 0;
4228     pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);
4229
4230     if (!pin.valid())
4231         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
4232
4233     Mat blob_ = blob.getMat();  // can't use InputArray directly due to MatExpr handling
4234     MatShape blobShape = shape(blob_);
4235
4236     if (pin.lid == 0)
4237     {
4238         CV_Assert(!impl->netInputLayer.empty());
4239         const DataLayer& netInputLayer = *impl->netInputLayer.get();
4240         if (!netInputLayer.shapes.empty())
4241         {
4242             CV_CheckLT(pin.oid, (int)netInputLayer.shapes.size(), "");
4243             const MatShape& inputShapeLimitation = netInputLayer.shapes[pin.oid];
4244             if (!inputShapeLimitation.empty())
4245             {
4246                 CV_CheckEQ(inputShapeLimitation.size(), blobShape.size(), "");
4247 #if 0  // TODO: DNNTestNetwork.MobileNet_SSD_Caffe_Different_Width_Height/0
4248                 const size_t dims = inputShapeLimitation.size();
4249                 for (size_t dim = 0; dim < dims; dim++)
4250                 {
4251                     if (dims >= 3 && dim == 0 && inputShapeLimitation[0] == 1)
4252                         continue;  // don't limit batch
4253                     CV_CheckEQ(inputShapeLimitation[dim], blobShape[dim], "");
4254                 }
4255 #endif
4256             }
4257         }
4258     }
4259
4260     LayerData &ld = impl->layers[pin.lid];
4261     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
4262     ld.outputBlobs.resize(numInputs);
4263     ld.outputBlobsWrappers.resize(numInputs);
4264     impl->netInputLayer->inputsData.resize(numInputs);
4265     impl->netInputLayer->scaleFactors.resize(numInputs);
4266     impl->netInputLayer->means.resize(numInputs);
4267
4268     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
4269     bool oldShape = prevShape == blobShape;
4270
4271     blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
4272     if (!oldShape)
4273         ld.outputBlobs[pin.oid] = impl->netInputLayer->inputsData[pin.oid];
4274
4275     if (!ld.outputBlobsWrappers[pin.oid].empty())
4276     {
4277         ld.outputBlobsWrappers[pin.oid]->setHostDirty();
4278     }
4279     impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
4280     impl->netInputLayer->means[pin.oid] = mean;
4281     impl->netWasAllocated = impl->netWasAllocated && oldShape;
4282 }
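// Illustrative usage sketch (not part of the library): feeding an image blob.
// The input name "data", size and mean values are hypothetical.
//
//     Mat blob = blobFromImage(img, 1.0, Size(300, 300), Scalar(104, 117, 123));
//     net.setInput(blob, "data");
//     // or let the input layer apply scale/mean preprocessing:
//     net.setInput(blob, "data", 1.0 / 127.5, Scalar(127.5, 127.5, 127.5));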
4283
4284 Mat Net::getParam(LayerId layer, int numParam)
4285 {
4286     LayerData &ld = impl->getLayerData(layer);
4287     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
4288     CV_Assert(numParam < (int)layerBlobs.size());
4289     return layerBlobs[numParam];
4290 }
4291
4292 void Net::setParam(LayerId layer, int numParam, const Mat &blob)
4293 {
4294     LayerData &ld = impl->getLayerData(layer);
4295
4296     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
4297     CV_Assert(numParam < (int)layerBlobs.size());
4298     // we don't make strong checks; use this function carefully
4299     layerBlobs[numParam] = blob;
4300 }
4301
4302 int Net::getLayerId(const String &layer)
4303 {
4304     return impl->getLayerId(layer);
4305 }
4306
4307 static
4308 string dumpLayerParameterSize(const string& name, const LayerParams& lp)
4309 {
4310     std::ostringstream out(name, std::ios::ate);
4311     DictValue param = lp.get(name);
4312     switch (param.size())
4313     {
4314         case 1: out << " : "; break;
4315         case 2: out << " (HxW): "; break;
4316         case 3: out << " (DxHxW): "; break;
4317         default:
4318             CV_LOG_INFO(NULL, format("DNN/dumpLayerParameterSize(): Unsupported '%s' size = %d", name.c_str(), param.size()));
4319             out << ": ";
4320     }
4321     for (size_t i = 0; i < param.size(); i++)
4322     {
4323         if (i > 0)
4324             out << " x ";
4325         out << param.get<int>(i);
4326     }
4327     return out.str();
4328 }
4329
4330 String Net::dump()
4331 {
4332     CV_Assert(!empty());
4333
4334     bool hasInput = !impl->netInputLayer->inputsData.empty();
4335
4336     if (hasInput)
4337     {
4338         if (!impl->netWasAllocated)
4339             impl->setUpNet();
4340     }
4341
4342     return impl->dump();
4343 }
4344
4345 string Net::Impl::dump()
4346 {
4347     bool hasInput = !netInputLayer->inputsData.empty();
4348
4349     std::ostringstream out;
4350     const std::map<int, LayerData>& map = layers;
4351
4352     Backend prefBackend = (Backend)preferableBackend;
4353     std::vector<std::vector<int> > skippedLayers;
4354     std::vector<int> skipId;
4355     std::vector<int> allLayers(map.size(), -1);
4356     int idPrev = -1;
4357     Ptr<BackendNode> prevNode;
4358     for (std::map<int, LayerData>::const_reverse_iterator rit = map.rbegin(); rit != map.rend(); ++rit)
4359     {
4360         std::map<int, Ptr<BackendNode> >::const_iterator itBackend = rit->second.backendNodes.find(prefBackend);
4361         if (prefBackend == DNN_BACKEND_OPENCV || itBackend == rit->second.backendNodes.end() ||
4362             itBackend->second.empty())
4363         {
4364                 if (rit->second.skip)
4365                     skipId.push_back(rit->first);
4366                 else if (!skipId.empty())
4367                 {
4368                     if (prefBackend == DNN_BACKEND_OPENCV || prevNode.empty())
4369                         skipId.push_back(rit->first);
4370                     else if (idPrev != -1)
4371                         skipId.push_back(idPrev);
4372
4373                     std::sort(skipId.begin(), skipId.end());
4374                     for (int i = 0; i < skipId.size(); i++) {
4375                         allLayers[skipId[i]] = skippedLayers.size();
4376                     }
4377                     skippedLayers.push_back(skipId);
4378                     skipId.clear();
4379                 }
4380         }
4381         else
4382         {
4383             if (itBackend->second == prevNode)
4384                 skipId.push_back(idPrev);
4385             else if (!skipId.empty())
4386             {
4387                 skipId.push_back(idPrev);
4388                 std::sort(skipId.begin(), skipId.end());
4389                 for (int i = 0; i < skipId.size(); i++) {
4390                     allLayers[skipId[i]] = skippedLayers.size();
4391                 }
4392                 skippedLayers.push_back(skipId);
4393                 skipId.clear();
4394             }
4395             idPrev = rit->first;
4396             prevNode = itBackend->second;
4397         }
4398     }
4399     string colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848", "#b35151"};
4400     string backend;
4401     switch (prefBackend)
4402     {
4403         case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
4404         case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
4405         case DNN_BACKEND_INFERENCE_ENGINE: // fallthru
4406         case DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019: backend = "DLIE/"; break;
4407         case DNN_BACKEND_INFERENCE_ENGINE_NGRAPH: backend = "NGRAPH/"; break;
4408         case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
4409         case DNN_BACKEND_VKCOM: backend = "VULKAN/"; break;
4410         case DNN_BACKEND_CUDA: backend = "CUDA/"; break;
4411         // don't use default:
4412     }
4413     out << "digraph G {\n";
4414     // Add nodes
4415     for (std::map<int, LayerData>::const_iterator it = map.begin(); it != map.end(); ++it)
4416     {
4417         const LayerData& ld = it->second;
4418         string name = ld.params.name;
4419         std::vector<int> clusterIds(1, it->first);
4420         if (allLayers[it->first] == -1 && !name.empty())
4421         {
4422             out << "\t\"" << name << "\" [label=\"";
4423         }
4424         else if (name.empty() || it->first != skippedLayers[allLayers[it->first]][0])
4425         {
4426             continue;
4427         }
4428         else // first node in cluster : it->first == skippedLayers[allLayers[it->first]][0]
4429         {
4430             int cluster = allLayers[it->first];
4431             out << "\t\"" << "cluster_" << cluster << "\" [label=\"{";
4432             clusterIds = skippedLayers[allLayers[it->first]]; // vertices in current cluster
4433         }
4434         for (int i = 0; i < clusterIds.size(); i++)
4435         {
4436             CV_DbgAssert(map.find(clusterIds[i]) != map.end());
4437             const LayerParams& lp = map.find(clusterIds[i])->second.params;
4438             if (!lp.name.empty()) {
4439                 if (i > 0) {
4440                     out << " | ";
4441                 }
4442                 out << lp.name << "\\n" << lp.type << "\\n";  // align center
4443                 if (lp.has("kernel_size"))
4444                 {
4445                     string kernel = dumpLayerParameterSize("kernel_size", lp);
4446                     out << kernel;
4447                     out << "\\l";  // align left
4448                 } else if (lp.has("kernel_h") && lp.has("kernel_w")) {
4449                     DictValue h = lp.get("kernel_h");
4450                     DictValue w = lp.get("kernel_w");
4451                     out << "kernel (HxW): " << h << " x " << w;
4452                     out << "\\l";  // align left
4453                 }
4454                 if (lp.has("stride")) {
4455                     string stride = dumpLayerParameterSize("stride", lp);
4456                     out << stride;
4457                     out << "\\l";  // align left
4458                 } else if (lp.has("stride_h") && lp.has("stride_w")) {
4459                     DictValue h = lp.get("stride_h");
4460                     DictValue w = lp.get("stride_w");
4461                     out << "stride (HxW): " << h << " x " << w;
4462                     out << "\\l";  // align left
4463                 }
4464                 if (lp.has("dilation")) {
4465                     string dilation = dumpLayerParameterSize("dilation", lp);
4466                     out << dilation;
4467                     out << "\\l";  // align left
4468                 } else if (lp.has("dilation_h") && lp.has("dilation_w")) {
4469                     DictValue h = lp.get("dilation_h");
4470                     DictValue w = lp.get("dilation_w");
4471                     out << "dilation (HxW): " << h << " x " << w;
4472                     out << "\\l";  // align left
4473                 }
4474                 if (lp.has("pad")) {
4475                     DictValue pad = lp.get("pad");
4476                     out << "pad ";
4477                     switch (pad.size())
4478                     {
4479                         case 1: out << ": " << pad; break;
4480                         case 2:
4481                             out << "(HxW): (" << pad.get<int>(0) << " x " << pad.get<int>(1) << ")";
4482                             break;
4483                         case 4:
4484                             out << "(HxW): (" << pad.get<int>(0) << ", " << pad.get<int>(2)
4485                                 << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(3) << ")";
4486                             break;
4487                         case 6:
4488                             out << "(DxHxW): (" << pad.get<int>(0) << ", " << pad.get<int>(3)
4489                                 << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(4)
4490                                 << ") x (" << pad.get<int>(2) << ", " << pad.get<int>(5) << ")";
4491                             break;
4492                         default: CV_Error(Error::StsNotImplemented,  format("Unsupported pad size = %d", pad.size()));
4493                     }
4494                     out << "\\l";  // align left
4495                 } else if (lp.has("pad_l") && lp.has("pad_t") && lp.has("pad_r") && lp.has("pad_b")) {
4496                     DictValue l = lp.get("pad_l");
4497                     DictValue t = lp.get("pad_t");
4498                     DictValue r = lp.get("pad_r");
4499                     DictValue b = lp.get("pad_b");
4500                     out << "pad (HxW): (" << t << ", " << b << ") x (" << l << ", " << r << ")";
4501                     out << "\\l";  // align left
4502                 }
4503                 else if (lp.has("pooled_w") || lp.has("pooled_h")) {
4504                     DictValue h = lp.get("pooled_h");
4505                     DictValue w = lp.get("pooled_w");
4506                     out << "pad pooled (HxW): " << h << " x " << w;
4507                     out << "\\l";  // align left
4508                 }
4509                 if (lp.has("pool")) {
4510                     out << "pool: " << lp.get("pool");
4511                     out << "\\l";  // align left
4512                 }
4513                 if (lp.has("global_pooling")) {
4514                     out << "global_pooling: " << lp.get("global_pooling");
4515                     out << "\\l";  // align left
4516                 }
4517                 if (lp.has("group")) {
4518                     out << "group: " << lp.get("group");
4519                     out << "\\l";  // align left
4520                 }
4521             }
4522         }
4523         if (!ld.outputBlobs.empty())
4524         {
4525             out << "output: " << ld.outputBlobs[0].size;
4526             out << "\\l";  // align left
4527         }
4528
4529         Ptr<BackendNode> layerBackend;
4530         std::map<int, Ptr<BackendNode> >::const_iterator ibn = ld.backendNodes.find(prefBackend);
4531         if (ibn != ld.backendNodes.end())
4532             layerBackend = ibn->second;
4533         out << (!layerBackend.empty() ? backend : "OCV/");
4534         int colorId = 0;
4535         const Target target = ld.layerInstance.empty()
4536                          ? DNN_TARGET_CPU
4537                                  : (Target)(ld.layerInstance->preferableTarget);  // TODO fix preferableTarget type
4538         switch (target)
4539         {
4540             case DNN_TARGET_CPU: out << "CPU"; colorId = layerBackend.empty() ? 0 : 5; break;
4541             case DNN_TARGET_OPENCL: out << "OCL"; colorId = 1; break;
4542             case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16"; colorId = 2; break;
4543             case DNN_TARGET_MYRIAD: out << "MYRIAD"; colorId = 3; break;
4544             case DNN_TARGET_VULKAN: out << "VULKAN"; colorId = 7; break;
4545             case DNN_TARGET_FPGA: out << "FPGA"; colorId = 4; break;
4546             case DNN_TARGET_CUDA: out << "CUDA"; colorId = 5; break;
4547             case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16"; colorId = 6; break;
4548             // don't use default:
4549         }
4550         out << "\\n";  // align center
4551         out << ((clusterIds.size() == 1)? "\" " : " }\" ");
4552         out << "fillcolor=\"" << colors[colorId] << "\" ";
4553         out << "style=filled ";
4554         out << "shape=" << ((clusterIds.size() == 1)? "box" : "record") << "]\n";
4555     }
4556     out << '\n';
4557     // Add edges
4558     int inputsSize = hasInput ? netInputLayer->outNames.size() : 0;
4559     for (std::map<int, LayerData>::const_iterator it = map.begin(); it != map.end(); ++it)
4560     {
4561         const LayerData& ld = it->second;
4562         if (allLayers[it->first] == -1)  // node
4563         {
4564             for (int i = 0; i < ld.consumers.size(); i++)
4565             {
4566                 int outId = ld.consumers[i].lid;
4567                 if (it == map.begin() && inputsSize > 1)
4568                     out << "\t\"" << ld.name << "_" << i << "\"" << " -> ";
4569                 else
4570                     out << "\t\"" << ld.name << "\"" << " -> ";
4571                 if (allLayers[outId] == -1)  // node
4572                 {
4573                     CV_DbgAssert(map.find(outId) != map.end());
4574                     out << "\"" << map.find(outId)->second.name << "\"\n";
4575                 }
4576                 else  // cluster
4577                 {
4578                     out << "\"" << "cluster_" << allLayers[outId] << "\"\n";
4579                 }
4580             }
4581         }
4582         else if (it->first == skippedLayers[allLayers[it->first]].back())  // edges from last layer in cluster
4583         {
4584             for (int i = 0; i < ld.consumers.size(); i++)
4585             {
4586                 int outId = ld.consumers[i].lid;
4587                 if (allLayers[outId] == -1) // node
4588                 {
4589                     CV_DbgAssert(map.find(outId) != map.end());
4590                     out << "\t\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
4591                     out << "\"" << map.find(outId)->second.name << "\"\n";
4592                 }
4593                 else if (allLayers[outId] != allLayers[it->first]) { // another cluster
4594                     out << "\t\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
4595                     out << "\"" << "cluster_" << allLayers[outId] << "\"\n";
4596                 }
4597             }
4598         }
4599     }
4600     out << "}\n";
4601     return out.str();
4602 }
4603
4604 void Net::dumpToFile(const String& path) {
4605     std::ofstream file(path.c_str());
4606     file << dump();
4607     file.close();
4608 }
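// Illustrative usage sketch (not part of the library): the dump is a Graphviz document.
//
//     net.dumpToFile("net.dot");
//     // render offline, e.g.:  dot -Tpng net.dot -o net.png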
4609
4610 Ptr<Layer> Net::getLayer(LayerId layerId)
4611 {
4612     LayerData &ld = impl->getLayerData(layerId);
4613     return ld.getLayerInstance();
4614 }
4615
4616 std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
4617 {
4618     LayerData &ld = impl->getLayerData(layerId);
4619
4620     std::vector<Ptr<Layer> > inputLayers;
4621     inputLayers.reserve(ld.inputBlobsId.size());
4622     for (int i = 0; i < ld.inputBlobsId.size(); ++i) {
4623         inputLayers.push_back(getLayer(ld.inputBlobsId[i].lid));
4624     }
4625     return inputLayers;
4626 }
4627
4628 std::vector<String> Net::getLayerNames() const
4629 {
4630     CV_TRACE_FUNCTION();
4631
4632     std::vector<String> res;
4633     res.reserve(impl->layers.size());
4634
4635     Impl::MapIdToLayerData::iterator it;
4636     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
4637     {
4638         if (it->second.id) //skip Data layer
4639             res.push_back(it->second.name);
4640     }
4641
4642     return res;
4643 }
4644
4645 bool Net::empty() const
4646 {
4647     return impl->layers.size() <= 1; //first layer is default Data layer
4648 }
4649
4650 std::vector<int> Net::getUnconnectedOutLayers() const
4651 {
4652     std::vector<int> layersIds;
4653
4654     Impl::MapIdToLayerData::iterator it;
4655     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
4656     {
4657         int lid = it->first;
4658         LayerData &ld = it->second;
4659
4660         if (ld.requiredOutputs.size() == 0)
4661             layersIds.push_back(lid);
4662     }
4663
4664     return layersIds;
4665 }
4666
4667 std::vector<String> Net::getUnconnectedOutLayersNames() const
4668 {
4669     std::vector<int> ids = getUnconnectedOutLayers();
4670     const size_t n = ids.size();
4671     std::vector<String> names(n);
4672     for (size_t i = 0; i < n; ++i)
4673     {
4674         names[i] = impl->layers[ids[i]].name;
4675     }
4676     return names;
4677 }
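// Illustrative usage sketch (not part of the library): enumerating output layers by id and name.
//
//     for (int id : net.getUnconnectedOutLayers())
//         std::cout << id << ": " << net.getLayer(id)->name << std::endl;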
4678
4679 void Net::getLayersShapes(const ShapesVec& netInputShapes,
4680                           std::vector<int>& layersIds,
4681                           std::vector<ShapesVec>& inLayersShapes,
4682                           std::vector<ShapesVec>& outLayersShapes) const
4683 {
4684     layersIds.clear();
4685     inLayersShapes.clear();
4686     outLayersShapes.clear();
4687
4688     Impl::LayersShapesMap inOutShapes;
4689     impl->getLayersShapes(netInputShapes, inOutShapes);
4690
4691     for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
4692         it != inOutShapes.end(); it++)
4693     {
4694         layersIds.push_back(it->first);
4695         inLayersShapes.push_back(it->second.in);
4696         outLayersShapes.push_back(it->second.out);
4697     }
4698 }
4699
4700 void Net::getLayersShapes(const MatShape& netInputShape,
4701                           std::vector<int>& layerIds,
4702                           std::vector<ShapesVec>& inLayersShapes,
4703                           std::vector<ShapesVec>& outLayersShapes) const
4704 {
4705     getLayersShapes(ShapesVec(1, netInputShape),
4706                     layerIds, inLayersShapes, outLayersShapes);
4707 }
4708
4709 void Net::getLayerShapes(const MatShape& netInputShape,
4710                          const int layerId,
4711                          ShapesVec& inLayerShapes,
4712                          ShapesVec& outLayerShapes) const
4713 {
4714     getLayerShapes(ShapesVec(1, netInputShape),
4715                    layerId, inLayerShapes, outLayerShapes);
4716
4717 }
4718
4719 void Net::getLayerShapes(const ShapesVec& netInputShapes,
4720                     const int layerId,
4721                     ShapesVec& inLayerShapes,
4722                     ShapesVec& outLayerShapes) const
4723 {
4724     LayerShapes shapes;
4725     impl->getLayerShapes(netInputShapes, layerId, shapes);
4726     inLayerShapes = shapes.in;
4727     outLayerShapes = shapes.out;
4728 }
4729
4730 int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
4731 {
4732     CV_TRACE_FUNCTION();
4733
4734     int64 flops = 0;
4735     std::vector<int> ids;
4736     std::vector<std::vector<MatShape> > inShapes, outShapes;
4737     getLayersShapes(netInputShapes, ids, inShapes, outShapes);
4738     CV_Assert(inShapes.size() == outShapes.size());
4739     CV_Assert(inShapes.size() == ids.size());
4740
4741     for(int i = 0; i < ids.size(); i++)
4742     {
4743         flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
4744                                                                    outShapes[i]);
4745     }
4746
4747     return flops;
4748 }
4749
4750 int64 Net::getFLOPS(const MatShape& netInputShape) const
4751 {
4752     return getFLOPS(std::vector<MatShape>(1, netInputShape));
4753 }
4754
4755 int64 Net::getFLOPS(const int layerId,
4756               const std::vector<MatShape>& netInputShapes) const
4757 {
4758     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
4759     CV_Assert(layer != impl->layers.end());
4760
4761     LayerShapes shapes;
4762     impl->getLayerShapes(netInputShapes, layerId, shapes);
4763
4764     return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
4765 }
4766
4767 int64 Net::getFLOPS(const int layerId,
4768               const MatShape& netInputShape) const
4769 {
4770     return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
4771 }
4772
4773 void Net::getLayerTypes(std::vector<String>& layersTypes) const
4774 {
4775     layersTypes.clear();
4776
4777     std::map<String, int> layers;
4778     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
4779          it != impl->layers.end(); it++)
4780     {
4781         if (layers.find(it->second.type) == layers.end())
4782             layers[it->second.type] = 0;
4783         layers[it->second.type]++;
4784     }
4785
4786     for (std::map<String, int>::iterator it = layers.begin();
4787          it != layers.end(); it++)
4788     {
4789         layersTypes.push_back(it->first);
4790     }
4791 }
4792
4793 int Net::getLayersCount(const String& layerType) const
4794 {
4795     int count = 0;
4796     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
4797          it != impl->layers.end(); it++)
4798     {
4799         if (it->second.type == layerType)
4800             count++;
4801     }
4802     return count;
4803 }
4804
4805 void Net::getMemoryConsumption(const int layerId,
4806                                const std::vector<MatShape>& netInputShapes,
4807                                size_t& weights, size_t& blobs) const
4808 {
4809     CV_TRACE_FUNCTION();
4810
4811     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
4812     CV_Assert(layer != impl->layers.end());
4813
4814     weights = blobs = 0;
4815
4816     for(int i = 0; i < layer->second.params.blobs.size(); i++)
4817     {
4818         const Mat& weightsBlob = layer->second.params.blobs[i];
4819         weights += weightsBlob.total()*weightsBlob.elemSize();
4820     }
4821
4822     ShapesVec inLayerShapes, outLayerShapes;
4823     getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
4824     for(int i = 0; i < outLayerShapes.size(); i++)
4825     {
4826         blobs += total(outLayerShapes[i]) * sizeof(float);
4827     }
4828 }
4829
4830 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
4831                                size_t& weights, size_t& blobs) const
4832 {
4833     CV_TRACE_FUNCTION();
4834
4835     std::vector<int> layerIds;
4836     std::vector<size_t> w, b;
4837     getMemoryConsumption(netInputShapes, layerIds, w, b);
4838
4839     weights = blobs = 0;
4840     for(int i = 0; i < layerIds.size(); i++)
4841     {
4842         weights += w[i];
4843         blobs += b[i];
4844     }
4845 }
4846
4847 void Net::getMemoryConsumption(const int layerId,
4848                                const MatShape& netInputShape,
4849                                size_t& weights, size_t& blobs) const
4850 {
4851     getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
4852                          weights, blobs);
4853 }
4854
4855 void Net::getMemoryConsumption(const MatShape& netInputShape,
4856                                size_t& weights, size_t& blobs) const
4857 {
4858     getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
4859                          weights, blobs);
4860 }
4861
4862 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
4863                                   std::vector<int>& layerIds, std::vector<size_t>& weights,
4864                                   std::vector<size_t>& blobs) const
4865 {
4866     CV_TRACE_FUNCTION();
4867
4868     layerIds.clear();
4869     weights.clear();
4870     blobs.clear();
4871
4872     std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
4873
4874     getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
4875
4876     for(int i = 0; i < layerIds.size(); i++)
4877     {
4878         size_t w = 0, b = 0;  // byte counts; avoid truncating size_t sums into int
4879         Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
4880         CV_Assert(layer != impl->layers.end());
4881
4882         for(int j = 0; j < layer->second.params.blobs.size(); j++)
4883         {
4884             const Mat& weightsBlob = layer->second.params.blobs[j];
4885             w += weightsBlob.total()*weightsBlob.elemSize();
4886         }
4887
4888         for(int j = 0; j < outLayerShapes[i].size(); j++)
4889         {
4890             b += total(outLayerShapes[i][j]) * sizeof(float);
4891         }
4892
4893         weights.push_back(w);
4894         blobs.push_back(b);
4895     }
4896 }
4897
4898 void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
4899                                std::vector<size_t>& weights, std::vector<size_t>& blobs) const
4900 {
4901     getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
4902                          weights, blobs);
4903 }
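// Illustrative usage sketch (not part of the library): rough memory estimate for a
// hypothetical 1x3x224x224 input (shape() comes from opencv2/dnn/shape_utils.hpp).
//
//     size_t weights = 0, blobs = 0;
//     net.getMemoryConsumption(shape(1, 3, 224, 224), weights, blobs);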
4904
4905 void Net::enableFusion(bool fusion)
4906 {
4907     if( impl->fusion != fusion )
4908     {
4909         impl->fusion = fusion;
4910         impl->netWasAllocated = false;
4911         impl->clear();
4912     }
4913 }
4914
4915 void Net::setHalideScheduler(const String& scheduler)
4916 {
4917     CV_TRACE_FUNCTION();
4918     CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());
4919
4920     impl->halideConfigFile = scheduler;
4921 }
4922
4923 int64 Net::getPerfProfile(std::vector<double>& timings)
4924 {
4925     timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
4926     int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
4927     return total;
4928 }
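// Illustrative usage sketch (not part of the library): per-layer timings are in ticks.
//
//     std::vector<double> layerTimes;
//     int64 ticks = net.getPerfProfile(layerTimes);
//     double totalMs = ticks * 1000.0 / getTickFrequency();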
4929
4930 //////////////////////////////////////////////////////////////////////////
4931
4932 Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
4933
4934 Layer::Layer(const LayerParams &params)
4935     : blobs(params.blobs), name(params.name), type(params.type)
4936 {
4937     preferableTarget = DNN_TARGET_CPU;
4938 }
4939
4940 void Layer::setParamsFrom(const LayerParams &params)
4941 {
4942     blobs = params.blobs;
4943     name = params.name;
4944     type = params.type;
4945 }
4946
4947 int Layer::inputNameToIndex(String)
4948 {
4949     return -1;
4950 }
4951
4952 int Layer::outputNameToIndex(const String&)
4953 {
4954     return 0;
4955 }
4956
4957 bool Layer::supportBackend(int backendId)
4958 {
4959     return backendId == DNN_BACKEND_OPENCV;
4960 }
4961
4962 Ptr<BackendNode> Layer::initCUDA(
4963     void*,
4964     const std::vector<Ptr<BackendWrapper>>&,
4965     const std::vector<Ptr<BackendWrapper>>&)
4966 {
4967     CV_Error(Error::StsNotImplemented, "CUDA pipeline of " + type +
4968                                        " layers is not defined.");
4969     return Ptr<BackendNode>();
4970 }
4971
4972 Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
4973 {
4974     CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
4975                                        " layers is not defined.");
4976     return Ptr<BackendNode>();
4977 }
4978
4979 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
4980 {
4981     CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
4982                                        " layers is not defined.");
4983     return Ptr<BackendNode>();
4984 }
4985
4986 Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
4987 {
4988     CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
4989                                        " layers is not defined.");
4990     return Ptr<BackendNode>();
4991 }
4992
4993 Ptr<BackendNode> Layer::initNgraph(const std::vector<Ptr<BackendWrapper> > & inputs, const std::vector<Ptr<BackendNode> >& nodes)
4994 {
4995     CV_Error(Error::StsNotImplemented, "nGraph/Inference Engine pipeline of " + type +
4996                                        " layers is not defined.");
4997     return Ptr<BackendNode>();
4998 }
4999
5000 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
5001                                  const std::vector<Mat> &outputs, int targetId) const
5002 {
5003 #ifdef  HAVE_HALIDE
5004     CV_TRACE_FUNCTION();
5005
5006     Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
5007                 xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
5008     Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
5009
5010     int outW, outH, outC, outN;
5011     getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
5012
5013     if (targetId == DNN_TARGET_CPU)
5014     {
5015         if (outW == 1 && outH == 1)
5016         {
5017             if (outC + outN == 1)
5018                 return;
5019
5020             if (outC > 8)
5021               top.split(c, co, ci, 8)
5022                  .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
5023                  .parallel(tile)
5024                  .vectorize(ci, 8);
5025             else
5026               top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
5027                  .parallel(tile);
5028         }
5029         else
5030         {
5031             if (outH > 2)
5032             {
5033                 top.reorder(x, c, y)
5034                    .split(y, yo, yi, 2)
5035                    .fuse(yo, n, tile)
5036                    .parallel(tile)
5037                    .unroll(yi)
5038                    .vectorize(x, outW >= 16 ? 16 : outW);
5039             }
5040         }
5041     }
5042     else if (targetId == DNN_TARGET_OPENCL)
5043     {
5044         if (outW == 1 && outH == 1)
5045         {
5046             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
5047             top.split(c, co, ci, c_split)
5048                .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
5049                .gpu_blocks(tile)
5050                .gpu_threads(ci);
5051         }
5052         else
5053         {
5054             int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
5055             int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
5056             // Supported vectorization widths: 2, 3, 4, 8, 16
5057             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
5058             top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
5059                .split(c, co, ci, c_split)
5060                .gpu_blocks(xo, yo, co)
5061                .gpu_threads(xi, yi)
5062                .reorder(xi, yi, ci, xo, yo, co)
5063                .vectorize(ci);
5064         }
5065     }
5066     else
5067         CV_Error(Error::StsNotImplemented, "Unknown target identifier");
5068 #endif  // HAVE_HALIDE
5069 }
5070
5071 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
5072 {
5073     return Ptr<BackendNode>();
5074 }
5075
5076 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
5077 bool Layer::tryFuse(Ptr<Layer>&) { return false; }
5078 void Layer::getScaleShift(Mat& scale, Mat& shift) const
5079 {
5080     scale = Mat();
5081     shift = Mat();
5082 }
5083
5084 void Layer::unsetAttached()
5085 {
5086     setActivation(Ptr<ActivationLayer>());
5087 }
5088
5089 template <typename T>
5090 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
5091 {
5092     pv.resize(v.size());
5093     for (size_t i = 0; i < v.size(); i++)
5094         pv[i] = const_cast<T*>(&v[i]);
5095 }
5096
5097 void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
5098 {
5099     CV_TRACE_FUNCTION();
5100     this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
5101 }
5102
5103 void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
5104 {
5105     CV_UNUSED(input); CV_UNUSED(output);
5106 }
5107
5108 void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
5109 {
5110     CV_TRACE_FUNCTION();
5111     std::vector<Mat> inputs, outputs;
5112     inputs_arr.getMatVector(inputs);
5113     outputs_arr.getMatVector(outputs);
5114
5115     std::vector<Mat*> inputsp;
5116     vecToPVec(inputs, inputsp);
5117     this->finalize(inputsp, outputs);
5118 }
5119
5120 std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
5121 {
5122     CV_TRACE_FUNCTION();
5123
5124     std::vector<Mat> outputs;
5125     this->finalize(inputs, outputs);
5126     return outputs;
5127 }
5128
5129 void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
5130 {
5131     // We kept this method for compatibility. DNN calls it now only to support users' implementations.
5132 }
5133
5134 void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
5135 {
5136     CV_TRACE_FUNCTION();
5137     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
5138
5139     Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
5140 }
5141
5142 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
5143 {
5144     CV_TRACE_FUNCTION();
5145     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
5146
5147     if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
5148     {
5149         std::vector<UMat> inputs;
5150         std::vector<UMat> outputs;
5151         std::vector<UMat> internals;
5152
5153         std::vector<UMat> orig_inputs;
5154         std::vector<UMat> orig_outputs;
5155         std::vector<UMat> orig_internals;
5156
5157         inputs_arr.getUMatVector(orig_inputs);
5158         outputs_arr.getUMatVector(orig_outputs);
5159         internals_arr.getUMatVector(orig_internals);
5160
5161         inputs.resize(orig_inputs.size());
5162         for (size_t i = 0; i < orig_inputs.size(); i++)
5163             convertFp16(orig_inputs[i], inputs[i]);
5164
5165         outputs.resize(orig_outputs.size());
5166         for (size_t i = 0; i < orig_outputs.size(); i++)
5167             outputs[i].create(shape(orig_outputs[i]), CV_32F);
5168
5169         internals.resize(orig_internals.size());
5170         for (size_t i = 0; i < orig_internals.size(); i++)
5171             internals[i].create(shape(orig_internals[i]), CV_32F);
5172
5173         forward(inputs, outputs, internals);
5174
5175         for (size_t i = 0; i < outputs.size(); i++)
5176             convertFp16(outputs[i], orig_outputs[i]);
5177
5178         // sync results back
5179         outputs_arr.assign(orig_outputs);
5180         internals_arr.assign(orig_internals);
5181         return;
5182     }
5183     std::vector<Mat> inpvec;
5184     std::vector<Mat> outputs;
5185     std::vector<Mat> internals;
5186
5187     inputs_arr.getMatVector(inpvec);
5188     outputs_arr.getMatVector(outputs);
5189     internals_arr.getMatVector(internals);
5190
5191     std::vector<Mat*> inputs(inpvec.size());
5192     for (int i = 0; i < inpvec.size(); i++)
5193         inputs[i] = &inpvec[i];
5194
5195     this->forward(inputs, outputs, internals);
5196
5197     // sync results back
5198     outputs_arr.assign(outputs);
5199     internals_arr.assign(internals);
5200 }
5201
5202 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
5203 {
5204     CV_TRACE_FUNCTION();
5205
5206     this->finalize(inputs, outputs);
5207     this->forward(inputs, outputs, internals);
5208 }
5209
5210 Layer::~Layer() {}
5211
5212 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
5213                             const int requiredOutputs,
5214                             std::vector<MatShape> &outputs,
5215                             std::vector<MatShape> &internals) const
5216 {
5217     CV_Assert(inputs.size());
5218     outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
5219     return false;
5220 }
5221
5222 //////////////////////////////////////////////////////////////////////////
5223
5224 static Mutex& getLayerFactoryMutex()
5225 {
5226     static Mutex* volatile instance = NULL;
5227     if (instance == NULL)
5228     {
5229         cv::AutoLock lock(getInitializationMutex());
5230         if (instance == NULL)
5231             instance = new Mutex();
5232     }
5233     return *instance;
5234 }
5235
5236 typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
5237
5238 static LayerFactory_Impl& getLayerFactoryImpl_()
5239 {
5240     static LayerFactory_Impl impl;
5241     return impl;
5242 }
5243
5244 static LayerFactory_Impl& getLayerFactoryImpl()
5245 {
5246     static LayerFactory_Impl* volatile instance = NULL;
5247     if (instance == NULL)
5248     {
5249         cv::AutoLock lock(getLayerFactoryMutex());
5250         if (instance == NULL)
5251         {
5252             instance = &getLayerFactoryImpl_();
5253             initializeLayerFactory();
5254         }
5255     }
5256     return *instance;
5257 }
5258
5259 void LayerFactory::registerLayer(const String &type, Constructor constructor)
5260 {
5261     CV_TRACE_FUNCTION();
5262     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
5263
5264     cv::AutoLock lock(getLayerFactoryMutex());
5265     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type);
5266
5267     if (it != getLayerFactoryImpl().end())
5268     {
5269         if (it->second.back() == constructor)
5270             CV_Error(cv::Error::StsBadArg, "Layer \"" + type + "\" was already registered");
5271         it->second.push_back(constructor);
5272     }
5273     getLayerFactoryImpl().insert(std::make_pair(type, std::vector<Constructor>(1, constructor)));
5274 }
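// Illustrative usage sketch (not part of the library): registering a user-defined layer type.
// MyLayer is a hypothetical cv::dnn::Layer subclass.
//
//     static Ptr<Layer> createMyLayer(LayerParams& params)
//     {
//         return Ptr<Layer>(new MyLayer(params));
//     }
//     ...
//     LayerFactory::registerLayer("MyType", createMyLayer);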
5275
5276 void LayerFactory::unregisterLayer(const String &type)
5277 {
5278     CV_TRACE_FUNCTION();
5279     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
5280
5281     cv::AutoLock lock(getLayerFactoryMutex());
5282
5283     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type);
5284     if (it != getLayerFactoryImpl().end())
5285     {
5286         if (it->second.size() > 1)
5287             it->second.pop_back();
5288         else
5289             getLayerFactoryImpl().erase(it);
5290     }
5291 }
5292
5293 Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
5294 {
5295     CV_TRACE_FUNCTION();
5296     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
5297
5298     cv::AutoLock lock(getLayerFactoryMutex());
5299     LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type);
5300
5301     if (it != getLayerFactoryImpl().end())
5302     {
5303         CV_Assert(!it->second.empty());
5304         return it->second.back()(params);
5305     }
5306     else
5307     {
5308         return Ptr<Layer>(); //NULL
5309     }
5310 }
5311
5312 BackendNode::BackendNode(int backendId) : backendId(backendId) {}
5313
5314 BackendNode::~BackendNode() {}
5315
5316 BackendWrapper::BackendWrapper(int backendId, int targetId)
5317     : backendId(backendId), targetId(targetId) {}
5318
5319 BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
5320 {
5321     CV_Error(Error::StsNotImplemented,
5322              "Constructor of backend wrapper must be implemented");
5323 }
5324
5325 BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
5326 {
5327     CV_Error(Error::StsNotImplemented,
5328              "Constructor of backend wrapper must be implemented");
5329 }
5330
5331 BackendWrapper::~BackendWrapper() {}
5332
5333 Net readNet(const String& _model, const String& _config, const String& _framework)
5334 {
5335     String framework = toLowerCase(_framework);
5336     String model = _model;
5337     String config = _config;
5338     const std::string modelExt = model.substr(model.rfind('.') + 1);
5339     const std::string configExt = config.substr(config.rfind('.') + 1);
5340     if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
5341                                 modelExt == "prototxt" || configExt == "prototxt")
5342     {
5343         if (modelExt == "prototxt" || configExt == "caffemodel")
5344             std::swap(model, config);
5345         return readNetFromCaffe(config, model);
5346     }
5347     if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
5348                                      modelExt == "pbtxt" || configExt == "pbtxt")
5349     {
5350         if (modelExt == "pbtxt" || configExt == "pb")
5351             std::swap(model, config);
5352         return readNetFromTensorflow(model, config);
5353     }
5354     if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
5355                                 configExt == "t7" || configExt == "net")
5356     {
5357         return readNetFromTorch(model.empty() ? config : model);
5358     }
5359     if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
5360                                   modelExt == "cfg" || configExt == "cfg")
5361     {
5362         if (modelExt == "cfg" || configExt == "weights")
5363             std::swap(model, config);
5364         return readNetFromDarknet(config, model);
5365     }
5366     if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
5367                                modelExt == "xml" || configExt == "xml")
5368     {
5369         if (modelExt == "xml" || configExt == "bin")
5370             std::swap(model, config);
5371         return readNetFromModelOptimizer(config, model);
5372     }
5373     if (framework == "onnx" || modelExt == "onnx")
5374     {
5375         return readNetFromONNX(model);
5376     }
5377     CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
5378                                       model + (config.empty() ? "" : ", " + config));
5379 }
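// Illustrative usage sketch (not part of the library): the framework is inferred from the
// file extensions; file names are hypothetical.
//
//     Net caffeNet = readNet("model.caffemodel", "model.prototxt");
//     Net tfNet    = readNet("graph.pb");
//     Net onnxNet  = readNet("model.onnx");
//     Net irNet    = readNet("model.xml", "model.bin");  // dispatches to readNetFromModelOptimizer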
5380
5381 Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
5382             const std::vector<uchar>& bufferConfig)
5383 {
5384     String framework = toLowerCase(_framework);
5385     if (framework == "caffe")
5386         return readNetFromCaffe(bufferConfig, bufferModel);
5387     else if (framework == "tensorflow")
5388         return readNetFromTensorflow(bufferModel, bufferConfig);
5389     else if (framework == "darknet")
5390         return readNetFromDarknet(bufferConfig, bufferModel);
5391     else if (framework == "torch")
5392         CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
5393     else if (framework == "dldt")
5394         return readNetFromModelOptimizer(bufferConfig, bufferModel);
5395     CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
5396 }
5397
5398 Net readNetFromModelOptimizer(const String &xml, const String &bin)
5399 {
5400     return Net::readFromModelOptimizer(xml, bin);
5401 }
5402
5403 Net readNetFromModelOptimizer(const std::vector<uchar>& bufferCfg, const std::vector<uchar>& bufferModel)
5404 {
5405     return Net::readFromModelOptimizer(bufferCfg, bufferModel);
5406 }
5407
5408 Net readNetFromModelOptimizer(
5409         const uchar* bufferModelConfigPtr, size_t bufferModelConfigSize,
5410         const uchar* bufferWeightsPtr, size_t bufferWeightsSize
5411 )
5412 {
5413     return Net::readFromModelOptimizer(
5414         bufferModelConfigPtr, bufferModelConfigSize,
5415         bufferWeightsPtr, bufferWeightsSize
5416     );
5417 }
5418
5419 CV__DNN_INLINE_NS_END
5420 }} // namespace