/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "op_halide.hpp"
#include "op_inf_engine.hpp"
#include "op_vkcom.hpp"
#include "op_cuda.hpp"
#include "halide_scheduler.hpp"

#include <set>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <fstream>
#include <iterator>
#include <numeric>
#include <memory>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>

#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>

#include <opencv2/core/cuda.hpp>

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN

// This option is useful for running Valgrind memory error detection.
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);

#ifdef HAVE_OPENCL
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
#endif

static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef HAVE_INF_ENGINE
    (size_t)DNN_BACKEND_INFERENCE_ENGINE
#else
    (size_t)DNN_BACKEND_OPENCV
#endif
);

// Additional checks (slows down execution!)
static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
using std::make_pair;
using std::set;

//==================================================================================================

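// Registry of the backend/target pairs that are available in the current build
// and runtime environment. It is a singleton populated lazily on first access
// and queried by getAvailableBackends()/getAvailableTargets() below.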
class BackendRegistry
{
public:
    typedef std::vector< std::pair<Backend, Target> > BackendsList;
    const BackendsList & getBackends() const { return backends; }
    static BackendRegistry & getRegistry()
    {
        static BackendRegistry impl;
        return impl;
    }
private:
    BackendRegistry()
    {
#ifdef HAVE_HALIDE
        backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
#  ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL())
            backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
#  endif
#endif // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
        if (checkIETarget(DNN_TARGET_CPU))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU));
        if (checkIETarget(DNN_TARGET_MYRIAD))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
        if (checkIETarget(DNN_TARGET_FPGA))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_FPGA));
#  ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL() && ocl::Device::getDefault().isIntel())
        {
            if (checkIETarget(DNN_TARGET_OPENCL))
                backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL));
            if (checkIETarget(DNN_TARGET_OPENCL_FP16))
                backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
        }
#  endif
#endif // HAVE_INF_ENGINE

#ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL())
        {
            backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
            backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
        }
#endif

        backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));

#ifdef HAVE_VULKAN
        if (haveVulkan())
            backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
#endif

#ifdef HAVE_CUDA
        if (haveCUDA()) {
            backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
            backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
        }
#endif
    }
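    // Checks whether the Inference Engine backend can run on the given target
    // by building a minimal single-convolution network and attempting a forward pass.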
    static inline bool checkIETarget(int target)
    {
#ifndef HAVE_INF_ENGINE
        return false;
#else
        cv::dnn::Net net;
        cv::dnn::LayerParams lp;
        lp.set("kernel_size", 1);
        lp.set("num_output", 1);
        lp.set("bias_term", false);
        lp.type = "Convolution";
        lp.name = "testLayer";
        lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
        net.addLayerToPrev(lp.name, lp.type, lp);
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
        net.setPreferableTarget(target);
        static int inpDims[] = {1, 2, 3, 4};
        net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
        try
        {
            net.forward();
        }
        catch(...)
        {
            return false;
        }
        return true;
#endif
    }

    BackendsList backends;
};


std::vector< std::pair<Backend, Target> > getAvailableBackends()
{
    return BackendRegistry::getRegistry().getBackends();
}

std::vector<Target> getAvailableTargets(Backend be)
{
    if (be == DNN_BACKEND_DEFAULT)
        be = (Backend)PARAM_DNN_BACKEND_DEFAULT;

    std::vector<Target> result;
    const BackendRegistry::BackendsList all_backends = getAvailableBackends();
    for(BackendRegistry::BackendsList::const_iterator i = all_backends.begin(); i != all_backends.end(); ++i )
    {
        if (i->first == be)
            result.push_back(i->second);
    }
    return result;
}

//==================================================================================================

namespace
{
    typedef std::vector<MatShape> ShapesVec;

    struct LayerShapes
    {
        ShapesVec in, out, internal;
        // There is no guarantee that a layer which supports in-place computation
        // will actually be computed in-place (input.data_ptr == output.data_ptr).
        // If a layer reports that it can work in-place and the layers after it
        // no longer use the input blob, we set output = input.
        bool supportInPlace;
        LayerShapes() {supportInPlace = false;}
    };
}

Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
                  const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
                   const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> images(1, image.getMat());
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
}

Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

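// Builds a 4D NCHW blob from the input images: each image is resized (or
// resized and center-cropped when crop=true) to `size`, optionally converted
// to float, mean-subtracted and scaled, with an optional R/B channel swap,
// before its channels are split into the blob planes.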
void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
                    Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
    if (ddepth == CV_8U)
    {
        CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
        CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
    }

    std::vector<Mat> images;
    images_.getMatVector(images);
    CV_Assert(!images.empty());
    for (size_t i = 0; i < images.size(); i++)
    {
        Size imgSize = images[i].size();
        if (size == Size())
            size = imgSize;
        if (size != imgSize)
        {
            if(crop)
            {
              float resizeFactor = std::max(size.width / (float)imgSize.width,
                                            size.height / (float)imgSize.height);
              resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
              Rect crop(Point(0.5 * (images[i].cols - size.width),
                              0.5 * (images[i].rows - size.height)),
                        size);
              images[i] = images[i](crop);
            }
            else
              resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
        }
        if(images[i].depth() == CV_8U && ddepth == CV_32F)
            images[i].convertTo(images[i], CV_32F);
        Scalar mean = mean_;
        if (swapRB)
            std::swap(mean[0], mean[2]);

        images[i] -= mean;
        images[i] *= scalefactor;
    }

    size_t nimages = images.size();
    Mat image0 = images[0];
    int nch = image0.channels();
    CV_Assert(image0.dims == 2);
    if (nch == 3 || nch == 4)
    {
        int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
        blob_.create(4, sz, ddepth);
        Mat blob = blob_.getMat();
        Mat ch[4];

        for(size_t i = 0; i < nimages; i++ )
        {
            const Mat& image = images[i];
            CV_Assert(image.depth() == blob_.depth());
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
            CV_Assert(image.size() == image0.size());

            for( int j = 0; j < nch; j++ )
                ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
            if(swapRB)
                std::swap(ch[0], ch[2]);
            split(image, ch);
        }
    }
    else
    {
       CV_Assert(nch == 1);
       int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
       blob_.create(4, sz, ddepth);
       Mat blob = blob_.getMat();

       for(size_t i = 0; i < nimages; i++ )
       {
           const Mat& image = images[i];
           CV_Assert(image.depth() == blob_.depth());
           nch = image.channels();
           CV_Assert(image.dims == 2 && (nch == 1));
           CV_Assert(image.size() == image0.size());

           image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
       }
    }
}

void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
{
    CV_TRACE_FUNCTION();

    //A blob is a 4 dimensional matrix in floating point precision
    //blob_[0] = batchSize = nbOfImages
    //blob_[1] = nbOfChannels
    //blob_[2] = height
    //blob_[3] = width
    CV_Assert(blob_.depth() == CV_32F);
    CV_Assert(blob_.dims == 4);

    images_.create(cv::Size(1, blob_.size[0]), blob_.depth());

    std::vector<Mat> vectorOfChannels(blob_.size[1]);
    for (int n = 0; n <  blob_.size[0]; ++n)
    {
        for (int c = 0; c < blob_.size[1]; ++c)
        {
            vectorOfChannels[c] = getPlane(blob_, n, c);
        }
        cv::merge(vectorOfChannels, images_.getMatRef(n));
    }
}

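// Wraps a host Mat into a UMat for the OpenCL target. The host buffer is
// copied to the device lazily: setHostDirty() marks the host data as newer,
// and copyToDevice() refreshes the UMat before it is used.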
class OpenCLBackendWrapper : public BackendWrapper
{
public:
    OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        m.copyTo(umat);
        host = &m;
        hostDirty = false;
    }

    OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
        : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
        CV_Assert(!base.empty());

        host = &m;

        int shape[] = {1, (int)base->umat.total()};
        umat = base->umat.reshape(1, 2, &shape[0])
                         .colRange(0, host->total())
                         .reshape(1, host->dims, &host->size[0]);
        hostDirty = false;
    }

    static Ptr<BackendWrapper> create(Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
    }

    static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
    }

    static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
    {
        const int numWrappers = wrappers.size();
        std::vector<UMat> mats(wrappers.size());
        for (int i = 0; i < numWrappers; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->copyToDevice();
            mats[i] = umatWrapper->umat;
        }
        return mats;
    }

    // Replaces all umats in wrappers with the given ones.
    static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
                       const std::vector<UMat>& umats)
    {
        CV_Assert(wrappers.size() == umats.size());
        for (int i = 0, n = umats.size(); i < n; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->umat = umats[i];
        }
    }

    ~OpenCLBackendWrapper() {}

    // Copies data from the device to host memory.
    virtual void copyToHost() CV_OVERRIDE
    {
        umat.copyTo(*host);
    }

    virtual void setHostDirty() CV_OVERRIDE
    {
        hostDirty = true;
    }

    void copyToDevice()
    {
        if (hostDirty)
        {
            host->copyTo(umat);
            hostDirty = false;
        }
    }

private:
    UMat umat;
    Mat* host;
    bool hostDirty;
};

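// Identifies a single layer output: `lid` is the layer id and `oid` is the
// index of the output blob within that layer.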
struct LayerPin
{
    int lid;
    int oid;

    LayerPin(int layerId = -1, int outputId = -1)
        : lid(layerId), oid(outputId) {}

    bool valid() const
    {
        return (lid >= 0 && oid >= 0);
    }

    bool equal(const LayerPin &r) const
    {
        return (lid == r.lid && oid == r.oid);
    }

    bool operator<(const LayerPin &r) const
    {
        return lid < r.lid || (lid == r.lid && oid < r.oid);
    }

    bool operator ==(const LayerPin &r) const
    {
        return lid == r.lid && oid == r.oid;
    }
};

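// Per-layer bookkeeping used by Net::Impl: layer parameters, connectivity
// (input pins, consumers), allocated blobs and their backend wrappers, and
// the lazily created Layer instance.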
struct LayerData
{
    LayerData() : id(-1), skip(false), flag(0) {}
    LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
        : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
    {
        CV_TRACE_FUNCTION();

        //add logging info
        params.name = name;
        params.type = type;
    }

    int id;
    String name;
    String type;
    LayerParams params;

    std::vector<LayerPin> inputBlobsId;
    std::set<int> inputLayersId;
    std::set<int> requiredOutputs;
    std::vector<LayerPin> consumers;
    std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;

    Ptr<Layer> layerInstance;
    std::vector<Mat> outputBlobs;
    std::vector<Mat*> inputBlobs;
    std::vector<Mat> internals;
    // Computation nodes of implemented backends (except DEFAULT).
    std::map<int, Ptr<BackendNode> > backendNodes;
    // Flag to skip this layer's computation for a specific backend.
    bool skip;

    int flag;

    Ptr<Layer> getLayerInstance()
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(type, "type", type.c_str());

        if (layerInstance)
            return layerInstance;

        layerInstance = LayerFactory::createLayerInstance(type, params);
        if (!layerInstance)
        {
            CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
        }

        return layerInstance;
    }
};

//fake layer containing network input blobs
struct DataLayer : public Layer
{
    DataLayer() : Layer()
    {
        skip = false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (outputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> outputs, internals;
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |      uint8 |        fp32 |
        for (int i = 0; i < inputsData.size(); ++i)
        {
            double scale = scaleFactors[i];
            Scalar& mean = means[i];
            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (singleMean)
            {
                inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
            }
            else
            {
                for (int n = 0; n < inputsData[i].size[0]; ++n)
                    for (int c = 0; c < inputsData[i].size[1]; ++c)
                    {
                        Mat inp = getPlane(inputsData[i], n, c);
                        Mat out = getPlane(outputs[i], n, c);
                        inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                    }
            }
        }
    }

#ifdef HAVE_OPENCL
    std::vector<Mat> tmp_expressions;
    bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |       fp32 |        fp16 |
        // |      uint8 |        fp32 |
        std::vector<UMat> outputs;
        outputs_.getUMatVector(outputs);

        tmp_expressions.clear();
        for (int i = 0; i < inputsData.size(); ++i)
        {
            Mat inputData = inputsData[i];

            double scale = scaleFactors[i];
            Scalar& mean = means[i];

            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (outputs_.depth() == CV_16S)
            {
                if (singleMean)
                {
                    tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
                    convertFp16(tmp_expressions.back(), outputs[i]);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            tmp_expressions.push_back(scale * (inp - mean[c]));
                            convertFp16(tmp_expressions.back(), out);
                        }
                }
            }
            else
            {
                CV_Assert(outputs_.depth() == CV_32F);
                if (singleMean)
                {
                    inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                        }
                }
            }
        }
        return true;
    }
#endif

    int outputNameToIndex(const String& tgtName) CV_OVERRIDE
    {
        int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
        return (idx < (int)outNames.size()) ? idx : -1;
    }

    void setNames(const std::vector<String> &names)
    {
        outNames.assign(names.begin(), names.end());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == requiredOutputs);
        outputs.assign(inputs.begin(), inputs.end());
        return false;
    }

    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);

        CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
                  inputsData.size() == outputs.size());
        skip = true;
        for (int i = 0; skip && i < inputsData.size(); ++i)
        {
            if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
                skip = false;
        }
    }

    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        CV_CheckEQ(inputsData.size(), (size_t)1, "");
        CV_CheckEQ(inputsData[0].dims, 4, "");
        const size_t numChannels = inputsData[0].size[1];
        CV_Assert(numChannels <= 4);

        // Scale
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
                                       InferenceEngine::Layout::C);
        auto weights = InferenceEngine::make_shared_blob<float>(td);
        weights->allocate();

        float* weight_buf = weights->buffer().as<float*>();
        std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);

        // Mean subtraction
        auto biases = InferenceEngine::make_shared_blob<float>(td);
        biases->allocate();
        float* bias_buf = biases->buffer().as<float*>();

        for (int i = 0; i < numChannels; ++i)
        {
            bias_buf[i] = -means[0][i] * scaleFactors[0];
        }

        InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
        addConstantData("weights", weights, ieLayer);
        addConstantData("biases", biases, ieLayer);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }

    std::vector<String> outNames;
    // Preprocessing parameters for each network's input.
    std::vector<double> scaleFactors;
    std::vector<Scalar> means;
    std::vector<Mat> inputsData;
    bool skip;
};

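// Tracks ownership and reuse of blob memory across layers. Reference counts
// are kept per memory host; once a blob is no longer referenced, its memory
// may be reused for the outputs of later layers.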
struct BlobManager
{
public:
    // Increases the reference counter of a layer output.
    void addReference(const LayerPin& lp)
    {
        std::map<LayerPin, int>::iterator it = refCounter.find(lp);
        if (it == refCounter.end())
            refCounter[lp] = 1;
        else
            it->second += 1;
    }

    void addReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            addReference(pins[i]);
        }
    }

    // Returns the number of references to the allocated memory that is used
    // by the specified layer blob.
    int numReferences(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());
        LayerPin memHost = mapIt->second;

        std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
        CV_Assert(refIt != refCounter.end());
        return refIt->second;
    }

    // Reuse data allocated in <host> inside the <user> blob.
    void reuse(const LayerPin& host, const LayerPin& user)
    {
        CV_Assert(reuseMap.find(user) == reuseMap.end());
        CV_Assert(reuseMap.find(host) != reuseMap.end());
        LayerPin memHost = reuseMap[host];
        reuseMap[user] = memHost;
        if (refCounter.find(memHost) != refCounter.end())
        {
            std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
            if (userRefIt != refCounter.end())
            {
                refCounter[memHost] += userRefIt->second;
                refCounter.erase(userRefIt);
            }
            else
                refCounter[memHost] += 1;
        }
    }

    // Decreases the reference counter of the memory allocated for the specified blob.
    void releaseReference(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());

        std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
        CV_Assert(refIt != refCounter.end());
        CV_Assert(refIt->second > 0);
        refIt->second -= 1;
    }

    void releaseReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            releaseReference(pins[i]);
        }
    }

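    // Reuses the smallest previously allocated blob that is currently
    // unreferenced and large enough to hold `shape`; otherwise allocates a new
    // host blob. Memory reuse can be disabled via
    // OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS.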
    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
    {
        if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
        {
            Mat bestBlob;
            LayerPin bestBlobPin;

            std::map<LayerPin, Mat>::iterator hostIt;
            std::map<LayerPin, int>::iterator refIt;

            const int targetTotal = total(shape);
            int bestBlobTotal = INT_MAX;

            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
            {
                refIt = refCounter.find(hostIt->first);
                // Use only blobs that have been referenced before; otherwise
                // the blob might be used as a network output.
                if (refIt != refCounter.end() && refIt->second == 0)
                {
                    Mat& unusedBlob = hostIt->second;
                    if (unusedBlob.total() >= targetTotal &&
                        unusedBlob.total() < bestBlobTotal)
                    {
                        bestBlobPin = hostIt->first;
                        bestBlob = unusedBlob;
                        bestBlobTotal = unusedBlob.total();
                    }
                }
            }
            if (!bestBlob.empty())
            {
                reuse(bestBlobPin, lp);
                dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
                return;
            }
        }

        {
            // If dst has already been allocated with total(shape) elements,
            // it is not recreated and the dst.data pointer remains the same.
            dst.create(shape, use_half ? CV_16S : CV_32F);
            addHost(lp, dst);
        }
    }

    void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
                               std::vector<LayerPin>& pinsForInternalBlobs,
                               bool use_half = false)
    {
        CV_TRACE_FUNCTION();

        pinsForInternalBlobs.clear();

        std::vector<Mat>& outputBlobs = ld.outputBlobs,
                &internalBlobs = ld.internals;

        const ShapesVec& outShapes = layerShapes.out,
                internalShapes = layerShapes.internal;

        outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
        internalBlobs.resize(internalShapes.size());

        CV_Assert(ld.requiredOutputs.size() <= outShapes.size());

        // Check whether the layer can work in-place.
        bool inPlace = false;
        if (layerShapes.supportInPlace)
        {
            if (ld.inputBlobs.size() == 1)
            {
                // Get the number of references to the input memory.
                int numRef = numReferences(ld.inputBlobsId[0]);
                // In-place is possible only if the current layer is the one and only consumer of this blob.
                inPlace = numRef == 1;
            }
        }

        ShapesVec shapes(outShapes);
        shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
        std::vector<Mat*> blobs;
        for(int i = 0; i < outputBlobs.size(); i++)
        {
            blobs.push_back(&outputBlobs[i]);
        }

        for(int i = 0; i < internalBlobs.size(); i++)
        {
            blobs.push_back(&internalBlobs[i]);
            if (total(internalShapes[i]))
            {
                pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
            }
        }

        addReferences(pinsForInternalBlobs);

        std::map<int, std::vector<int> > idxSizes;
        for(int i = 0; i < shapes.size(); i++)
        {
            idxSizes[total(shapes[i])].push_back(i);
        }

        std::map<int, std::vector<int> >::reverse_iterator it;
        for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
        {
            for(int j = 0; j < it->second.size(); j++)
            {
                int index = it->second[j];
                if (total(shapes[index]))
                {
                    LayerPin blobPin(ld.id, index);
                    if (index < outShapes.size() && inPlace)
                    {
                        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
                        reuse(ld.inputBlobsId[0], blobPin);
                    }
                    else
                        reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
                }
            }
        }
    }

    // Clears the internal state. Called before every reallocation.
    void reset()
    {
        CV_TRACE_FUNCTION();

        refCounter.clear();
        reuseMap.clear();
        memHosts.clear();
    }

private:
    // Registers allocated memory.
    void addHost(const LayerPin& lp, const Mat& mat)
    {
        CV_Assert(memHosts.find(lp) == memHosts.end());
        reuseMap[lp] = lp;
        memHosts[lp] = mat;
    }

    std::map<LayerPin, int> refCounter;
    // Maps a pin to its origin blob (the one for which the memory was originally allocated).
    // For origin blobs, key == value.
    std::map<LayerPin, LayerPin> reuseMap;
    std::map<LayerPin, Mat> memHosts;
};

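// Creates a backend-specific wrapper around a host blob for the given
// backend/target pair. Returns an empty pointer for the plain CPU path,
// where no wrapping is needed.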
static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
    if (backendId == DNN_BACKEND_OPENCV)
    {
        if (targetId == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();
        else if (IS_DNN_OPENCL_TARGET(targetId))
            return OpenCLBackendWrapper::create(m);
        else
            CV_Error(Error::StsNotImplemented, "Unknown target identifier");
    }
    else if (backendId == DNN_BACKEND_HALIDE)
    {
        CV_Assert(haveHalide());
#ifdef HAVE_HALIDE
        return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
#endif  // HAVE_HALIDE
    }
    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    {
        CV_Assert(haveInfEngine());
#ifdef HAVE_INF_ENGINE
        return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
#endif  // HAVE_INF_ENGINE
    }
    else if (backendId == DNN_BACKEND_VKCOM)
    {
        CV_Assert(haveVulkan());
#ifdef HAVE_VULKAN
        return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
#endif  // HAVE_VULKAN
    }
    else if (backendId == DNN_BACKEND_CUDA)
    {
        CV_Assert(haveCUDA());

#ifdef HAVE_CUDA
        switch (targetId)
        {
        case DNN_TARGET_CUDA:
            return CUDABackendWrapperFP32::create(m);
        case DNN_TARGET_CUDA_FP16:
            return CUDABackendWrapperFP16::create(m);
        default:
            CV_Assert(IS_DNN_CUDA_TARGET(targetId));
        }
#endif
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    return Ptr<BackendWrapper>();
}

struct Net::Impl
{
    typedef std::map<int, LayerShapes> LayersShapesMap;
    typedef std::map<int, LayerData> MapIdToLayerData;

    Impl()
    {
        //allocate fake net input layer
        netInputLayer = Ptr<DataLayer>(new DataLayer());
        LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
        inpl.id = 0;
        netInputLayer->name = inpl.name = "_input";
        inpl.type = "__NetInputLayer__";
        inpl.layerInstance = netInputLayer;
        layerNameToId.insert(std::make_pair(inpl.name, inpl.id));

        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
        isAsync = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;

#ifdef HAVE_CUDA
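        // Create a CUDA stream together with cuBLAS and cuDNN handles bound to
        // that stream; the resulting context is stored in cudaInfo for use by
        // the CUDA backend.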
        if (cv::cuda::getCudaEnabledDeviceCount() > 0)
        {
            cuda4dnn::csl::CSLContext context;
            context.stream = cuda4dnn::csl::Stream(true);
            context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
            context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);

            cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
        }
#endif
    }

    Ptr<DataLayer> netInputLayer;
    std::vector<LayerPin> blobsToKeep;
    MapIdToLayerData layers;
    std::map<String, int> layerNameToId;
    BlobManager blobManager;
    int preferableBackend;
    int preferableTarget;
    String halideConfigFile;
    bool skipInfEngineInit;
    // Map host data to backend specific wrapper.
    std::map<void*, Ptr<BackendWrapper> > backendWrappers;

    int lastLayerId;

    bool netWasAllocated;
    bool fusion;
    bool isAsync;
    std::vector<int64> layersTimings;
    Mat output_blob;

#ifdef HAVE_CUDA
    struct CudaInfo_t
    {
        CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
        cuda4dnn::csl::CSLContext context;
        cuda4dnn::csl::Workspace workspace;
    };

    std::unique_ptr<CudaInfo_t> cudaInfo;
#endif

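    // Returns a backend wrapper for a host blob, reusing an existing wrapper
    // when the same host memory has already been wrapped so that blobs which
    // alias the same data share the underlying device buffer.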
    Ptr<BackendWrapper> wrap(Mat& host)
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();

        MatShape shape(host.dims);
        for (int i = 0; i < host.dims; ++i)
            shape[i] = host.size[i];

        void* data = host.data;
        if (backendWrappers.find(data) != backendWrappers.end())
        {
            Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
            if (preferableBackend == DNN_BACKEND_OPENCV)
            {
                CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
                return OpenCLBackendWrapper::create(baseBuffer, host);
            }
            else if (preferableBackend == DNN_BACKEND_HALIDE)
            {
                CV_Assert(haveHalide());
  #ifdef HAVE_HALIDE
                return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
  #endif  // HAVE_HALIDE
            }
            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            {
                return wrapMat(preferableBackend, preferableTarget, host);
            }
            else if (preferableBackend == DNN_BACKEND_VKCOM)
            {
  #ifdef HAVE_VULKAN
                return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
  #endif
            }
            else if (preferableBackend == DNN_BACKEND_CUDA)
            {
                CV_Assert(haveCUDA());
#ifdef HAVE_CUDA
                switch (preferableTarget)
                {
                case DNN_TARGET_CUDA:
                    return CUDABackendWrapperFP32::create(baseBuffer, shape);
                case DNN_TARGET_CUDA_FP16:
                    return CUDABackendWrapperFP16::create(baseBuffer, shape);
                default:
                    CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
                }
#endif
            }
            else
                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
        }

        Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
        backendWrappers[data] = wrapper;
        return wrapper;
    }

#ifdef HAVE_HALIDE
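    // Schedules each Halide-capable layer, either via the scheduling config
    // referenced by halideConfigFile or via the layer's automatic scheduler,
    // and then compiles the Halide pipelines using a small pool of threads.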
    void compileHalide()
    {
        CV_TRACE_FUNCTION();

        CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);

        HalideScheduler scheduler(halideConfigFile);
        std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
        {
            LayerData &ld = it->second;
            Ptr<Layer> layer = ld.layerInstance;
            if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
            {
                CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
                bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
                if (!scheduled)
                {
                    // Use automatic scheduling provided by layer.
                    layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
                                                ld.inputBlobs, ld.outputBlobs,
                                                preferableTarget);
                }
                compileList.emplace_back(ld);
            }
        }
        std::atomic<int> progress(0);
        auto fn = ([&] () -> void
        {
            for (;;)
            {
                int id = progress.fetch_add(1);
                if ((size_t)id >= compileList.size())
                    return;
                const LayerData& ld = compileList[id].get();
                Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
                dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
            }
        });
        size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
        num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
        std::vector<std::thread> threads(num_threads - 1);
        for (auto& t: threads) t = std::thread(fn);
        fn(); // process own tasks
        for (auto& t: threads) t.join();
    }
#endif

    void clear()
    {
        CV_TRACE_FUNCTION();

        MapIdToLayerData::iterator it;
        for (it = layers.begin(); it != layers.end(); it++)
        {
            if (it->second.id != 0) {
                it->second.inputBlobs.clear();
                it->second.outputBlobs.clear();
                it->second.internals.clear();
            }
            it->second.skip = false;
            //it->second.consumers.clear();
            Ptr<Layer> currLayer = it->second.layerInstance;

            if( currLayer.empty() )
                continue;

            currLayer->unsetAttached();
        }

        layersTimings.clear();
    }

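    // Validates the preferable backend/target pair, falling back to the
    // OpenCV/CPU path when the requested backend is unavailable, then clears
    // the previous state, allocates layers and initializes the chosen backend.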
    void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
    {
        CV_TRACE_FUNCTION();

        if (preferableBackend == DNN_BACKEND_DEFAULT)
            preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;

        CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16);
        CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL);
        CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16 ||
                  preferableTarget == DNN_TARGET_MYRIAD ||
                  preferableTarget == DNN_TARGET_FPGA);
        CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
                  preferableTarget == DNN_TARGET_VULKAN);
        CV_Assert(preferableBackend != DNN_BACKEND_CUDA ||
                  IS_DNN_CUDA_TARGET(preferableTarget));

        if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
        {
            if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
            {
                CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
                preferableTarget = DNN_TARGET_CPU;
            }
#else
            {
                if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
                {
                    // Current implementation is only valid for GPU (#11494)
                    if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
                    {
                        CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
                        preferableTarget = DNN_TARGET_CPU;
                    }
                    else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
                    {
                        CV_LOG_WARNING(NULL,
                            "DNN: OpenCL target with fp16 precision is not supported "
                            "with current OpenCL device (tested with Intel GPUs only), "
                            "switching to OpenCL with fp32 precision.");
                        preferableTarget = DNN_TARGET_OPENCL;
                    }
                }
            }
#endif
            if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
            {
                preferableBackend = DNN_BACKEND_OPENCV;
                preferableTarget = DNN_TARGET_CPU;
            }

            if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA())
            {
#ifdef HAVE_CUDA
                CV_LOG_WARNING(NULL, "unable to use CUDA backend; switching to CPU");
#else
                CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU");
#endif
                preferableBackend = DNN_BACKEND_OPENCV;
                preferableTarget = DNN_TARGET_CPU;
            }

            clear();

            allocateLayers(blobsToKeep_);

            MapIdToLayerData::iterator it = layers.find(0);
            CV_Assert(it != layers.end());
            it->second.skip = netInputLayer->skip;

            initBackend();

            if (!netWasAllocated)
            {
#ifdef HAVE_HALIDE
                if (preferableBackend == DNN_BACKEND_HALIDE)
                    compileHalide();
#else
                CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
#endif
            }

            netWasAllocated = true;
            this->blobsToKeep = blobsToKeep_;
        }
    }

    int getLayerId(const String &layerName)
    {
        std::map<String, int>::iterator it = layerNameToId.find(layerName);
        return (it != layerNameToId.end()) ? it->second : -1;
    }

    int getLayerId(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? id : -1;
    }

    int getLayerId(DictValue &layerDesc)
    {
        if (layerDesc.isInt())
            return getLayerId(layerDesc.get<int>());
        else if (layerDesc.isString())
            return getLayerId(layerDesc.get<String>());

        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        return -1;
    }

    String getLayerName(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? it->second.name : "(unknown layer)";
    }

    LayerData& getLayerData(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);

        if (it == layers.end())
            CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));

        return it->second;
    }

    LayerData& getLayerData(const String &layerName)
    {
        int id = getLayerId(layerName);

        if (id < 0)
            CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");

        return getLayerData(id);
    }

    LayerData& getLayerData(const DictValue &layerDesc)
    {
        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        if (layerDesc.isInt())
            return getLayerData(layerDesc.get<int>());
        else /*if (layerDesc.isString())*/
            return getLayerData(layerDesc.get<String>());
    }

    static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
    {
        if ((int)ld.inputBlobsId.size() <= inNum)
        {
            ld.inputBlobsId.resize(inNum + 1);
        }
        else
        {
            LayerPin storedFrom = ld.inputBlobsId[inNum];
            if (storedFrom.valid() && !storedFrom.equal(from))
                CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
                                                 inNum, ld.name.c_str()));
        }

        ld.inputBlobsId[inNum] = from;
    }

    int resolvePinOutputName(LayerData &ld, const String &outName)
    {
        if (outName.empty())
            return 0;
        return ld.getLayerInstance()->outputNameToIndex(outName);
    }

    LayerPin getPinByAlias(const String &layerName)
    {
        LayerPin pin;
        pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        if (pin.lid >= 0)
            pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);

        return pin;
    }

    std::vector<LayerPin> getLayerOutPins(const String &layerName)
    {
        int lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        std::vector<LayerPin> pins;

        for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
        {
            pins.push_back(LayerPin(lid, i));
        }

        return pins;
    }

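    // Registers a connection: output #outNum of layer `outLayerId` becomes
    // input #inNum of layer `inLayerId`.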
1450     void connect(int outLayerId, int outNum, int inLayerId, int inNum)
1451     {
1452         CV_Assert(outLayerId < inLayerId);
1453         LayerData &ldOut = getLayerData(outLayerId);
1454         LayerData &ldInp = getLayerData(inLayerId);
1455
1456         addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
1457         ldOut.requiredOutputs.insert(outNum);
1458         ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
1459     }
1460
1461     void initBackend()
1462     {
1463         CV_TRACE_FUNCTION();
1464         if (preferableBackend == DNN_BACKEND_OPENCV)
1465             CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
1466         else if (preferableBackend == DNN_BACKEND_HALIDE)
1467             initHalideBackend();
1468         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1469             initInfEngineBackend();
1470         else if (preferableBackend == DNN_BACKEND_VKCOM)
1471             initVkComBackend();
1472         else if (preferableBackend == DNN_BACKEND_CUDA)
1473             initCUDABackend();
1474         else
1475             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1476     }
1477
1478     void initHalideBackend()
1479     {
1480         CV_TRACE_FUNCTION();
1481         CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());
1482
1483         // Iterator to current layer.
1484         MapIdToLayerData::iterator it = layers.begin();
1485         // Iterator to base layer for fusion. In example, in case of conv+bn+relu
1486         // it'll be a conv layer.
1487         MapIdToLayerData::iterator baseIt = layers.begin();
1488         for (; it != layers.end(); it++)
1489         {
1490             LayerData &ldTop = it->second;
1491             Ptr<Layer> layerTop = ldTop.layerInstance;
1492             if (!layerTop->supportBackend(preferableBackend))
1493             {
1494                 // Move base iterator to layer that don't support preferable
1495                 // backend to prevent fusion over layer of different backend.
1496                 baseIt = it;
1497                 continue;
1498             }
1499             // Try to do layers fusion.
1500             LayerData &ldBot = baseIt->second;
1501             Ptr<Layer> layerBot = ldBot.layerInstance;
1502             // 1. Check that the bottom and top layers are from the same backend.
1503             if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
1504             {
1505                 // 2. Check that current layer works in-place.
1506                 bool inPlace = ldTop.inputBlobs.size() == 1 &&
1507                                ldBot.outputBlobs.size() == 1 &&
1508                                ldTop.inputBlobs[0]->data ==
1509                                ldBot.outputBlobs[0].data;
1510                 if (inPlace)
1511                 {
1512                     // 3. Try to attach node.
1513                     CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
1514                     Ptr<BackendNode> fusedNode =
1515                         layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
1516                     if (!fusedNode.empty())
1517                     {
1518                         ldTop.skip = true;
1519                         ldBot.backendNodes[preferableBackend] = fusedNode;
1520                         ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
1521                         continue;
1522                     }
1523                 }
1524             }
1525             // No layers fusion.
1526             ldTop.skip = false;
1527             ldTop.backendNodes[DNN_BACKEND_HALIDE] =
1528                 layerTop->initHalide(ldTop.inputBlobsWrappers);
1529             baseIt = it;
1530         }
1531     }
1532
1533 #ifdef HAVE_INF_ENGINE
1534     // Before launching an Inference Engine graph we need to specify its output blobs.
1535     // This function requests output blobs based on the input references of
1536     // layers from the default backend or layers from different graphs.
1537     void addInfEngineNetOutputs(LayerData &ld)
1538     {
1539         Ptr<InfEngineBackendNet> layerNet;
1540         if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
1541         {
1542             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1543             if (!node.empty())
1544             {
1545                 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1546                 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1547                 layerNet = ieNode->net;
1548             }
1549         }
1550         // For every input reference we check whether it belongs to one of
1551         // the Inference Engine backend graphs and request an output blob if it does.
1552         // Do nothing if the layer's input is from the same graph.
1553         for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1554         {
1555             LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1556             Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1557             if (!inpNode.empty())
1558             {
1559                 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1560                 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1561                 if (layerNet != ieInpNode->net)
1562                 {
1563                     // layerNet is empty or nodes are from different graphs.
1564                     ieInpNode->net->addOutput(ieInpNode->layer.getName());
1565                 }
1566             }
1567         }
1568     }
1569 #endif  // HAVE_INF_ENGINE
1570
1571     void initVkComBackend()
1572     {
1573         CV_TRACE_FUNCTION();
1574         CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
1575 #ifdef HAVE_VULKAN
1576         if (!haveVulkan())
1577             return;
1578
1579         MapIdToLayerData::iterator it = layers.begin();
1580         for (; it != layers.end(); it++)
1581         {
1582             LayerData &ld = it->second;
1583             Ptr<Layer> layer = ld.layerInstance;
1584             if (!layer->supportBackend(preferableBackend))
1585             {
1586                 continue;
1587             }
1588
1589             ld.skip = false;
1590
1591             try
1592             {
1593                 ld.backendNodes[DNN_BACKEND_VKCOM] =
1594                     layer->initVkCom(ld.inputBlobsWrappers);
1595             }
1596             catch (const cv::Exception& e)
1597             {
1598                 CV_LOG_ERROR(NULL, "initVkCom failed, falling back to the CPU implementation. " << e.what());
1599                 ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
1600             }
1601         }
1602 #endif
1603     }
1604
1605     void initInfEngineBackend()
1606     {
1607         CV_TRACE_FUNCTION();
1608         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1609 #ifdef HAVE_INF_ENGINE
1610         MapIdToLayerData::iterator it;
1611         Ptr<InfEngineBackendNet> net;
1612
1613         for (it = layers.begin(); it != layers.end(); ++it)
1614         {
1615             LayerData &ld = it->second;
1616             if (ld.id == 0)
1617             {
1618                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1619                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1620                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1621                 {
1622                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1623 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1624                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1625 #else
1626                     dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
1627 #endif
1628                 }
1629             }
1630             else
1631             {
1632                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1633                 {
1634                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1635 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1636                     dataPtr->name = ld.name;
1637 #else
1638                     dataPtr->setName(ld.name);
1639 #endif
1640                 }
1641             }
1642         }
1643
1644         if (skipInfEngineInit)
1645         {
1646             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1647             CV_Assert(!node.empty());
1648
1649             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1650             CV_Assert(!ieNode.empty());
1651
1652             for (it = layers.begin(); it != layers.end(); ++it)
1653             {
1654                 LayerData &ld = it->second;
1655                 if (ld.id == 0)
1656                 {
1657                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1658                     {
1659                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1660 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1661                         dataPtr->name = netInputLayer->outNames[i];
1662 #else
1663                         dataPtr->setName(netInputLayer->outNames[i]);
1664 #endif
1665                     }
1666                 }
1667                 else
1668                 {
1669                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1670                     {
1671                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1672 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1673                         dataPtr->name = ld.name;
1674 #else
1675                         dataPtr->setName(ld.name);
1676 #endif
1677                     }
1678                 }
1679                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1680                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1681                 ld.skip = true;
1682             }
1683             layers[lastLayerId].skip = false;
1684             ieNode->net->init(preferableTarget);
1685             return;
1686         }
1687
1688         // Build Inference Engine networks from sets of layers that support this
1689         // backend. Split the whole model into several Inference Engine networks if
1690         // some of the layers are not implemented.
1691
1692         // Set of all input and output blobs wrappers for current network.
1693         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1694         for (it = layers.begin(); it != layers.end(); ++it)
1695         {
1696             LayerData &ld = it->second;
1697             if (ld.id == 0 && ld.skip)
1698                 continue;
1699             bool fused = ld.skip;
1700
1701             Ptr<Layer> layer = ld.layerInstance;
1702             if (!fused && !layer->supportBackend(preferableBackend))
1703             {
1704                 bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 &&
1705                                     INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2);
1706                 // TODO: there is a bug in Myriad plugin with custom layers shape infer.
1707                 if (preferableTarget == DNN_TARGET_MYRIAD)
1708                 {
1709                     for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
1710                     {
1711                         customizable = ld.inputBlobs[i]->size[0] == 1;
1712                     }
1713                 }
1714
1715                 // TODO: fix these workarounds
1716                 if (preferableTarget == DNN_TARGET_MYRIAD ||
1717                     preferableTarget == DNN_TARGET_OPENCL ||
1718                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1719                     customizable &= ld.type != "Concat";
1720
1721                 if (preferableTarget == DNN_TARGET_OPENCL ||
1722                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1723                     customizable &= ld.type != "Power";
1724
1725                 if (preferableTarget == DNN_TARGET_OPENCL)
1726                     customizable &= ld.type != "Eltwise";
1727
1728                 if (!customizable)
1729                 {
1730                     addInfEngineNetOutputs(ld);
1731                     net = Ptr<InfEngineBackendNet>();
1732                     netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in an #ifdef.
1733                     layer->preferableTarget = DNN_TARGET_CPU;
1734                     continue;
1735                 }
1736             }
1737             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1738
1739             // Create a new network if one of the inputs comes from a different Inference Engine graph.
1740             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1741             {
1742                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1743                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1744                 if (!inpNode.empty())
1745                 {
1746                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1747                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1748                     if (ieInpNode->net != net)
1749                     {
1750                         net = Ptr<InfEngineBackendNet>();
1751                         netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in an #ifdef.
1752                         break;
1753                     }
1754                 }
1755             }
1756
1757             Ptr<BackendNode> node;
1758             if (!net.empty())
1759             {
1760                 if (fused)
1761                 {
1762                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1763                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1764                     CV_Assert(inPlace);
1765                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1766                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1767                 }
1768             }
1769             else
1770                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1771
1772             if (!fused)
1773             {
1774                 if (layer->supportBackend(preferableBackend))
1775                     node = layer->initInfEngine(ld.inputBlobsWrappers);
1776                 else
1777                 {
1778                     node = Ptr<BackendNode>(new InfEngineBackendNode(
1779                         ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
1780                 }
1781             }
1782             else if (node.empty())
1783                 continue;
1784
1785             CV_Assert(!node.empty());
1786             ld.backendNodes[preferableBackend] = node;
1787
1788             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1789             CV_Assert(!ieNode.empty());
1790             ieNode->net = net;
1791
1792             // Convert weights to FP16 for specific targets.
1793             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1794                  preferableTarget == DNN_TARGET_MYRIAD ||
1795                  preferableTarget == DNN_TARGET_FPGA) && !fused)
1796             {
1797 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
1798                 for (const std::string& name : {"weights", "biases"})
1799                 {
1800                     auto it = ieNode->layer.getParameters().find(name);
1801                     if (it != ieNode->layer.getParameters().end())
1802                     {
1803                         InferenceEngine::Blob::Ptr bp = it->second.as<InferenceEngine::Blob::Ptr>();
1804                         it->second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(bp));
1805                     }
1806                 }
1807 #else
1808                 auto& blobs = ieNode->layer.getConstantData();
1809                 if (blobs.empty())
1810                 {
1811                     // In case of a non-weightable layer we have to specify
1812                     // its precision by adding a dummy blob.
1813                     auto blob = InferenceEngine::make_shared_blob<int16_t>(
1814                                     InferenceEngine::Precision::FP16,
1815                                     InferenceEngine::Layout::C, {1});
1816                     blob->allocate();
1817                     blobs[""] = blob;
1818                 }
1819                 else
1820                 {
1821                     for (auto& it : blobs)
1822                         it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
1823                 }
1824 #endif
1825             }
1826
1827             if (!fused)
1828                 net->addLayer(ieNode->layer);
1829
1830             net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
1831             net->addBlobs(ld.inputBlobsWrappers);
1832             net->addBlobs(ld.outputBlobsWrappers);
1833             addInfEngineNetOutputs(ld);
1834         }
1835
1836         // Initialize all networks.
1837         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1838         {
1839             LayerData &ld = it->second;
1840             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1841                 continue;
1842
1843             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1844             if (node.empty())
1845                 continue;
1846
1847             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1848             if (ieNode.empty())
1849                 continue;
1850
1851             CV_Assert(!ieNode->net.empty());
1852
1853             if (!ieNode->net->isInitialized())
1854             {
1855                 ieNode->net->init(preferableTarget);
1856                 ld.skip = false;
1857             }
1858         }
1859 #endif  // HAVE_INF_ENGINE
1860     }
1861
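         // Creates a CUDA backend node for every layer that reports support for
         // DNN_BACKEND_CUDA; unsupported layers keep the default CPU path. Each
         // node's workspace requirement is registered so that a single shared
         // scratch buffer can serve all layers during forward passes.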
1862     void initCUDABackend() {
1863         CV_Assert(haveCUDA());
1864
1865 #ifdef HAVE_CUDA
1866         for (auto& layer : layers)
1867         {
1868             auto& ld = layer.second;
1869             auto& layerInstance = ld.layerInstance;
1870
1871             if (!layerInstance->supportBackend(DNN_BACKEND_CUDA))
1872             {
1873                 std::ostringstream os;
1874                 os << "CUDA backend will fall back to the CPU implementation for the layer \"" << ld.name
1875                    << "\" of type " << ld.type << '\n';
1876                 CV_LOG_INFO(NULL, os.str().c_str());
1877                 continue;
1878             }
1879
1880             /* we make a copy so that `initCUDA` doesn't modify `cudaInfo->context` */
1881             auto context = cudaInfo->context;
1882             auto node = layerInstance->initCUDA(&context, ld.inputBlobsWrappers, ld.outputBlobsWrappers);
1883             ld.backendNodes[DNN_BACKEND_CUDA] = node;
1884
1885             auto cudaNode = node.dynamicCast<CUDABackendNode>();
1886             cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
1887         }
1888 #endif
1889     }
1890
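         // Allocates a single layer: parent layers are allocated first (recursively),
         // the layer's inputs are bound to its parents' output blobs, the blob manager
         // allocates or reuses the output and internal blobs, and finally
         // Layer::finalize() is called with the actual input/output blobs.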
1891     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1892     {
1893         CV_TRACE_FUNCTION();
1894
1895         LayerData &ld = layers[lid];
1896
1897         //already allocated
1898         if (ld.flag)
1899             return;
1900
1901         size_t ninputs = ld.inputBlobsId.size();
1902 #if 0
1903         printf("layer %s:", ld.name.c_str());
1904         for (size_t i = 0; i < ninputs; i++)
1905         {
1906             int inp_lid = ld.inputBlobsId[i].lid;
1907             LayerData &inp_ld = layers[inp_lid];
1908             int inp_outputs = (int)inp_ld.outputBlobs.size();
1909             std::cout << " " << inp_ld.name << "(" << inp_outputs;
1910
1911             for( int j = 0; j < inp_outputs; j++ )
1912             {
1913                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1914             }
1915             std::cout << ")";
1916         }
1917         printf("\n");
1918 #endif
1919
1920         //determine parent layers
1921         for (size_t i = 0; i < ninputs; i++)
1922             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1923
1924         //allocate parents
1925         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1926             allocateLayer(*i, layersShapes);
1927
1928         //bind inputs
1929         if (ld.id == 0)  // DataLayer
1930         {
1931             ninputs = netInputLayer->inputsData.size();
1932             ld.inputBlobsWrappers.resize(ninputs);
1933             for (size_t i = 0; i < ninputs; i++)
1934             {
1935                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1936 #ifdef HAVE_CUDA
1937                 if (IS_DNN_CUDA_TARGET(preferableTarget))
1938                 {
1939                     auto wrapper = ld.inputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
1940                     wrapper->setStream(cudaInfo->context.stream);
1941                 }
1942 #endif
1943             }
1944         }
1945         else
1946         {
1947             ld.inputBlobs.resize(ninputs);
1948             ld.inputBlobsWrappers.resize(ninputs);
1949             for (size_t i = 0; i < ninputs; i++)
1950             {
1951                 LayerPin from = ld.inputBlobsId[i];
1952                 CV_Assert(from.valid());
1953                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1954                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1955                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1956             }
1957         }
1958
1959         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1960
1961         CV_Assert(layerShapesIt != layersShapes.end());
1962
1963         std::vector<LayerPin> pinsForInternalBlobs;
1964         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1965                                           preferableBackend == DNN_BACKEND_OPENCV &&
1966                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
1967         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1968         for (int i = 0; i < ld.outputBlobs.size(); ++i)
1969         {
1970             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1971 #ifdef HAVE_CUDA
1972             if (IS_DNN_CUDA_TARGET(preferableTarget))
1973             {
1974                 auto wrapper = ld.outputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
1975                 wrapper->setStream(cudaInfo->context.stream);
1976             }
1977 #endif
1978         }
1979
1980         /* CUDA backend has its own system for internal blobs; we don't need these */
1981         ld.internalBlobsWrappers.resize((preferableBackend == DNN_BACKEND_CUDA) ? 0 : ld.internals.size());
1982         for (int i = 0; i < ld.internalBlobsWrappers.size(); ++i)
1983         {
1984             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1985         }
1986
1987         Ptr<Layer> layerPtr = ld.getLayerInstance();
1988         {
1989             std::vector<Mat> inps(ld.inputBlobs.size());
1990             for (int i = 0; i < ld.inputBlobs.size(); ++i)
1991             {
1992                 inps[i] = *ld.inputBlobs[i];
1993             }
1994             layerPtr->finalize(inps, ld.outputBlobs);
1995             layerPtr->preferableTarget = preferableTarget;
1996 #if 0
1997             std::cout << "\toutputs:";
1998             size_t noutputs = ld.outputBlobs.size();
1999             for (size_t j = 0; j < noutputs; j++)
2000             {
2001                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
2002             }
2003             std::cout << "\n";
2004 #endif
2005         }
2006
2007         // After allocating the layer, we decrease the reference counters of its input blobs.
2008         blobManager.releaseReferences(ld.inputBlobsId);
2009         blobManager.releaseReferences(pinsForInternalBlobs);
2010
2011         ld.flag = 1;
2012     }
2013
2014 #if 0
2015 #define printf_(args) printf args
2016 #else
2017 #define printf_(args)
2018 #endif
2019
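         // Graph-level fusion pass. It runs only when fusion is enabled (it can be
         // switched off via Net::enableFusion(false)) and the preferable backend is
         // able to consume fused nodes (OpenCV, CUDA or Inference Engine).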
2020     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
2021     {
2022         if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
2023                         preferableBackend != DNN_BACKEND_CUDA &&
2024                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
2025             return;
2026
2027         CV_TRACE_FUNCTION();
2028
2029         // Scan through all the layers. If there is a convolution layer followed by an activation layer,
2030         // we try to embed the activation into the convolution and disable separate execution of the activation.
2031         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
2032                                       blobsToKeep_.end());
2033         MapIdToLayerData::iterator it;
2034         for (it = layers.begin(); it != layers.end(); it++)
2035         {
2036             int lid = it->first;
2037             LayerData& ld = layers[lid];
2038             if( ld.skip )
2039             {
2040                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2041                 continue;
2042             }
2043             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2044
2045             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
2046             // with the current layer if they follow it. Normally, they are fused with the
2047             // convolution layer, but some of them (like activation) may also be fused with
2048             // fully-connected, element-wise (+) and some other layers.
2049             Ptr<Layer>& currLayer = ld.layerInstance;
2050             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
2051             {
2052                 LayerData* nextData = &layers[ld.consumers[0].lid];
2053                 LayerPin lpNext(ld.consumers[0].lid, 0);
2054                 while (nextData)
2055                 {
2056                     Ptr<Layer> nextLayer = nextData->layerInstance;
2057                     if (currLayer->tryFuse(nextLayer))
2058                     {
2059                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
2060                         nextData->skip = true;
2061                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2062                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2063                         if (nextData->consumers.size() == 1)
2064                         {
2065                             int nextLayerId = nextData->consumers[0].lid;
2066                             nextData = &layers[nextLayerId];
2067                             lpNext = LayerPin(nextLayerId, 0);
2068                         }
2069                         else
2070                         {
2071                             nextData = 0;
2072                             break;
2073                         }
2074                     }
2075                     else
2076                         break;
2077                 }
2078
2079                 if (preferableBackend != DNN_BACKEND_OPENCV)
2080                     continue;  // Go to the next layer.
2081
2082                 // TODO: support more fusion styles on the OpenCL target.
2083                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
2084                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
2085                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
2086                      ld.layerInstance->type != "Concat")) )
2087                     continue;
2088
2089                 while (nextData)
2090                 {
2091                     // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations
2092                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
2093                         nextData->type != "ReLU" &&
2094                         nextData->type != "ChannelsPReLU" &&
2095                         nextData->type != "ReLU6" &&
2096                         nextData->type != "TanH" &&
2097                         nextData->type != "Power")
2098                         break;
2099
2100                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2101                     if (nextActivLayer.empty())
2102                         break;
2103
2104                     if (currLayer->setActivation(nextActivLayer))
2105                     {
2106                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2107                         nextData->skip = true;
2108                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2109                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2110                         if (nextData->consumers.size() == 1)
2111                         {
2112                             int nextLayerId = nextData->consumers[0].lid;
2113                             nextData = &layers[nextLayerId];
2114                             lpNext = LayerPin(nextLayerId, 0);
2115                         }
2116                         else
2117                         {
2118                             nextData = 0;
2119                             break;
2120                         }
2121                     }
2122                     else
2123                         break;
2124                 }
2125
2126                 // Fuse a convolution layer followed by eltwise + activation (e.g. ReLU).
2127                 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
2128                 {
2129                     Ptr<EltwiseLayer> nextEltwiseLayer;
2130                     if( nextData )
2131                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
2132
2133                     if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2134                         nextData && nextData->inputBlobsId.size() == 2 )
2135                     {
2136                         LayerData *eltwiseData = nextData;
2137
2138                         // The eltwise layer has two inputs. We need to determine which
2139                         // is the base convolution layer and which could be used as its bias.
2140                         LayerData* biasLayerData = 0;
2141                         for (int i = 0; i < 2; ++i)
2142                         {
2143                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
2144                             CV_Assert(downLayerData);
2145                             while (downLayerData->skip)
2146                             {
2147                                 if (downLayerData->inputBlobsId.size() == 1)
2148                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
2149                                 else
2150                                 {
2151                                     downLayerData = 0;
2152                                     break;
2153                                 }
2154                             }
2155                             if (downLayerData && ld.id == downLayerData->id)
2156                             {
2157                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
2158                                 break;
2159                             }
2160                         }
2161                         CV_Assert(biasLayerData);
2162                         {
2163                             if( eltwiseData->consumers.size() == 1 )
2164                             {
2165                                 // fuse eltwise + activation layer
2166                                 if (biasLayerData->id < ld.id)
2167                                 {
2168                                     nextData = &layers[eltwiseData->consumers[0].lid];
2169                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
2170                                     Ptr<ActivationLayer> nextActivLayer;
2171                                     if( nextData )
2172                                         nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2173
2174                                     if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2175                                             (!nextData->type.compare("ReLU") ||
2176                                              !nextData->type.compare("ChannelsPReLU") ||
2177                                              !nextData->type.compare("Power")) &&
2178                                             currLayer->setActivation(nextActivLayer) )
2179                                     {
2180                                         CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2181                                         ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2182                                         printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2183                                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2184                                         eltwiseData->skip = true;
2185                                         nextData->skip = true;
2186                                         // This optimization is for cases like
2187                                         // some_layer   conv
2188                                         //   |             |
2189                                         //   +-- eltwise --+
2190                                         //          |
2191                                         //        activ
2192                                         // This way all the element-wise computations
2193                                         // (i.e. some_layer+conv or some_layer*conv)
2194                                         // would be done in the [conv] layer. So we need to
2195                                         // replace [conv]'s output blob with [eltwise]'s one
2196                                         // considering that [activ] is an in-place layer.
2197                                         // Also we need to move all the consumers' references.
2198                                         // To prevent memory collisions (i.e. when input of
2199                                         // [conv] and output of [eltwise] is the same blob)
2200                                         // we allocate a new blob.
2201                                         CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2202                                         ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2203                                         ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2204
2205                                         eltwiseData->outputBlobs = ld.outputBlobs;
2206                                         nextData->outputBlobs = ld.outputBlobs;
2207                                         eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2208                                         nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
2209
2210                                         // Move references of [activ] layer consumers to the newly allocated blob.
2211                                         for (int i = 0; i < nextData->consumers.size(); ++i)
2212                                         {
2213                                             LayerData& consumer = layers[nextData->consumers[i].lid];
2214                                             for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2215                                             {
2216                                                 if (consumer.inputBlobsId[j].lid == lpNext.lid)
2217                                                 {
2218                                                     consumer.inputBlobs[j] = &ld.outputBlobs[0];
2219                                                     consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2220                                                     break;
2221                                                 }
2222                                             }
2223                                         }
2224                                     }
2225                                 }
2226                             }
2227                         }
2228                     }
2229                 }
2230             }
2231
2232             if (preferableBackend != DNN_BACKEND_OPENCV)
2233                 continue;  // Go to the next layer.
2234
2235             // Optimization #2: if there is a concat layer that concatenates channels
2236             // from the inputs together (i.e. axis == 1), then we make the inputs of
2237             // the concat layer write directly into the concatenation output buffer
2238             // (and so we eliminate the concat layer, because the channels
2239             // are concatenated implicitly).
2240             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
2241             if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
2242                 ld.outputBlobs.size() == 1 )
2243             {
2244                 Mat& output = ld.outputBlobs[0];
2245                 UMat umat_output;
2246                 if (!ld.outputBlobsWrappers.empty() &&
2247                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
2248                 {
2249                     size_t i, ninputs = ld.inputBlobsId.size();
2250                     bool conv_layer = true;
2251                     for( i = 0; i < ninputs; i++ )
2252                     {
2253                         LayerPin pin = ld.inputBlobsId[i];
2254                         LayerData* inp_i_data = &layers[pin.lid];
2255                         while(inp_i_data->skip &&
2256                               inp_i_data->inputBlobsId.size() == 1 &&
2257                               inp_i_data->consumers.size() == 1)
2258                         {
2259                             pin = inp_i_data->inputBlobsId[0];
2260                             inp_i_data = &layers[pin.lid];
2261                         }
2262                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
2263                     }
2264                     if (!conv_layer)
2265                         continue;
2266                     std::vector<UMat> umat_outputBlobs;
2267                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2268                     umat_output = umat_outputBlobs[0];
2269                 }
2270
2271                 // TODO: in general, this optimization can always be done, but
2272                 // many layers currently check that the input/output blobs are
2273                 // continuous arrays. Unfortunately, this is not true when
2274                 // the concatenation optimization is applied with batch_size > 1.
2275                 // So, for now, we only apply this optimization in the most popular
2276                 // case batch_size == 1.
2277                 if( output.dims == 4 && output.size[0] == 1 )
2278                 {
2279                     size_t i, ninputs = ld.inputBlobsId.size();
2280                     std::vector<LayerPin> realinputs(ninputs);
2281                     for( i = 0; i < ninputs; i++ )
2282                     {
2283                         LayerPin pin = ld.inputBlobsId[i];
2284                         LayerData* inp_i_data = &layers[pin.lid];
2285                         while(inp_i_data->skip &&
2286                               inp_i_data->inputBlobsId.size() == 1 &&
2287                               inp_i_data->consumers.size() == 1)
2288                         {
2289                             pin = inp_i_data->inputBlobsId[0];
2290                             inp_i_data = &layers[pin.lid];
2291                         }
2292                         printf_(("\treal input for %s is %s\n",
2293                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
2294                                inp_i_data->getLayerInstance()->name.c_str()));
2295
2296                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
2297                             break;
2298                         realinputs[i] = pin;
2299                     }
2300
2301                     if( i >= ninputs )
2302                     {
2303                         // Allocate new memory to prevent collisions during memory
2304                         // reuse (see https://github.com/opencv/opencv/pull/10456).
2305                         output = output.clone();
2306                         if (preferableBackend == DNN_BACKEND_OPENCV &&
2307                             IS_DNN_OPENCL_TARGET(preferableTarget))
2308                         {
2309                             std::vector<UMat> umats(1);
2310                             umat_output = umat_output.clone();
2311                             umats[0] = umat_output;
2312                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2313                         }
2314                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2315                         int ofs = 0;
2316                         for( i = 0; i < ninputs; i++ )
2317                         {
2318                             LayerPin pin = realinputs[i];
2319                             LayerData* inp_i_data = &layers[pin.lid];
2320                             int channels_i = ld.inputBlobs[i]->size[1];
2321                             chrange[1] = Range(ofs, ofs + channels_i);
2322                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2323                                    pin.oid, ofs, ofs + channels_i));
2324                             ofs += channels_i;
2325                             Mat output_slice = output(chrange);
2326                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2327                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2328                             Mat* oldPtr = &curr_output;
2329                             curr_output = output_slice;
2330                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2331                             {
2332                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2333                                 umats[pin.oid] = umat_output(chrange);
2334                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2335                             }
2336                             // Layers that referred to the old input Mat will now refer to the
2337                             // new data through the same Mat object.
2338                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2339                         }
2340                         ld.skip = true;
2341                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2342                     }
2343                 }
2344             }
2345         }
2346     }
2347
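         // Allocates all layers of the network: infers per-layer shapes from the
         // network input shapes, sets up blob reference counting so that memory can
         // be reused between layers, allocates each layer in id order and finally
         // runs the fusion pass.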
2348     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2349     {
2350         CV_TRACE_FUNCTION();
2351
2352         MapIdToLayerData::iterator it;
2353         for (it = layers.begin(); it != layers.end(); it++)
2354             it->second.flag = 0;
2355
2356         CV_Assert(!layers[0].outputBlobs.empty());
2357         ShapesVec inputShapes;
2358         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2359         {
2360             Mat& inp = layers[0].outputBlobs[i];
2361             CV_Assert(inp.total());
2362             if (preferableBackend == DNN_BACKEND_OPENCV &&
2363                 preferableTarget == DNN_TARGET_OPENCL_FP16)
2364             {
2365                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
2366             }
2367             inputShapes.push_back(shape(inp));
2368         }
2369         LayersShapesMap layersShapes;
2370         getLayersShapes(inputShapes, layersShapes);
2371
2372         blobManager.reset();
2373         backendWrappers.clear();
2374
2375         for(auto& layer : layers)
2376         {
2377             auto& ld = layer.second;
2378             ld.inputBlobsWrappers.clear();
2379             ld.outputBlobsWrappers.clear();
2380             ld.internalBlobsWrappers.clear();
2381         }
2382
2383         // Fake references to input blobs.
2384         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2385             blobManager.addReference(LayerPin(0, i));
2386         for (it = layers.begin(); it != layers.end(); ++it)
2387         {
2388             const LayerData& ld = it->second;
2389             blobManager.addReferences(ld.inputBlobsId);
2390         }
2391
2392         for (int i = 0; i < blobsToKeep_.size(); i++)
2393         {
2394             blobManager.addReference(blobsToKeep_[i]);
2395         }
2396
2397         for (it = layers.begin(); it != layers.end(); it++)
2398         {
2399             int lid = it->first;
2400             allocateLayer(lid, layersShapes);
2401         }
2402
2403         layersTimings.resize(lastLayerId + 1, 0);
2404         fuseLayers(blobsToKeep_);
2405     }
2406
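         // Executes a single layer. If the OpenCV backend is selected, or no backend
         // node was created for this layer, the default CPU/OpenCL implementation is
         // used; otherwise the corresponding backend node is run. Per-layer execution
         // time is recorded in layersTimings.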
2407     void forwardLayer(LayerData &ld)
2408     {
2409         CV_TRACE_FUNCTION();
2410
2411         Ptr<Layer> layer = ld.layerInstance;
2412
2413         TickMeter tm;
2414         tm.start();
2415
2416         if( !ld.skip )
2417         {
2418             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2419             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2420             {
2421                 if (isAsync)
2422                     CV_Error(Error::StsNotImplemented, "Default implementation fallback is not supported in asynchronous mode");
2423
2424                 if (!layer->supportBackend(DNN_BACKEND_OPENCV))
2425                     CV_Error(Error::StsNotImplemented, format("Layer \"%s\" of type \"%s\" is not supported by the OpenCV backend",
2426                                                        ld.name.c_str(), ld.type.c_str()));
2427
2428                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2429                 {
2430                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2431                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2432                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2433                     layer->forward(umat_inputBlobs,
2434                                    umat_outputBlobs,
2435                                    umat_internalBlobs);
2436                     if (DNN_CHECK_NAN_INF)
2437                     {
2438                         bool fail = false;
2439                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2440                         {
2441                             UMat& u = umat_outputBlobs[i];
2442                             Mat m;
2443                             if (u.depth() == CV_16S) // FP16
2444                                 convertFp16(u, m);
2445                             else
2446                                 m = u.getMat(ACCESS_READ);
2447                             if (!checkRange(m))
2448                             {
2449                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2450                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2451                                 fail = true;
2452                             }
2453                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2454                             {
2455                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2456                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2457                                 fail = true;
2458                             }
2459                         }
2460                         if (fail)
2461                         {
2462                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2463                             {
2464                                 UMat& u = umat_inputBlobs[i];
2465                                 Mat m;
2466                                 if (u.depth() == CV_16S) // FP16
2467                                     convertFp16(u, m);
2468                                 else
2469                                     m = u.getMat(ACCESS_READ);
2470                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2471                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2472                             }
2473                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2474                             {
2475                                 UMat& u = umat_outputBlobs[i];
2476                                 Mat m;
2477                                 if (u.depth() == CV_16S) // FP16
2478                                     convertFp16(u, m);
2479                                 else
2480                                     m = u.getMat(ACCESS_READ);
2481                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2482                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2483                             }
2484                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2485                             {
2486                                 UMat& u = umat_internalBlobs[i];
2487                                 Mat m;
2488                                 if (u.depth() == CV_16S) // FP16
2489                                     convertFp16(u, m);
2490                                 else
2491                                     m = u.getMat(ACCESS_READ);
2492                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2493                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2494                             }
2495                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2496                                 CV_Assert(!fail);
2497                         }
2498                     }
2499                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2500                 }
2501                 else
2502                 {
2503                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2504                     {
2505                         if (!ld.inputBlobsWrappers[i].empty())
2506                             ld.inputBlobsWrappers[i]->copyToHost();
2507                     }
2508
2509                     std::vector<Mat> inps(ld.inputBlobs.size());
2510                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
2511                     {
2512                         inps[i] = *ld.inputBlobs[i];
2513                     }
2514                     layer->forward(inps, ld.outputBlobs, ld.internals);
2515
2516                     if (DNN_CHECK_NAN_INF)
2517                     {
2518                         bool fail = false;
2519                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2520                         {
2521                             const Mat& m = ld.outputBlobs[i];
2522                             if (!checkRange(m))
2523                             {
2524                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2525                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2526                                 fail = true;
2527                             }
2528                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2529                             {
2530                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2531                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2532                                 fail = true;
2533                             }
2534                         }
2535                         if (fail)
2536                         {
2537                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2538                             {
2539                                 const Mat* pM = ld.inputBlobs[i];
2540                                 if (!pM)
2541                                 {
2542                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
2543                                     continue;
2544                                 }
2545                                 const Mat& m = *pM;
2546                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2547                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2548                             }
2549                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2550                             {
2551                                 const Mat& m = ld.outputBlobs[i];
2552                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2553                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2554                             }
2555                             for (size_t i = 0; i < ld.internals.size(); ++i)
2556                             {
2557                                 const Mat& m = ld.internals[i];
2558                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2559                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2560                             }
2561                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2562                                 CV_Assert(!fail);
2563                         }
2564                     }
2565
2566                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2567                     {
2568                         if (!ld.outputBlobsWrappers[i].empty())
2569                             ld.outputBlobsWrappers[i]->setHostDirty();
2570                     }
2571                 }
2572             }
2573             else
2574             {
2575                 Ptr<BackendNode> node = it->second;
2576                 CV_Assert(!node.empty());
2577                 if (preferableBackend == DNN_BACKEND_CUDA)
2578                 {
2579                     CV_Assert(haveCUDA());
2580
2581 #ifdef HAVE_CUDA
2582                     Ptr<CUDABackendNode> cudaNode = node.dynamicCast<CUDABackendNode>();
2583                     CV_Assert(!cudaNode.empty());
2584
2585                     cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
2586 #endif
2587                 }
2588                 else if (preferableBackend == DNN_BACKEND_HALIDE)
2589                 {
2590                     forwardHalide(ld.outputBlobsWrappers, node);
2591                 }
2592                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2593                 {
2594                     forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
2595                 }
2596                 else if (preferableBackend == DNN_BACKEND_VKCOM)
2597                 {
2598                     try
2599                     {
2600                         forwardVkCom(ld.outputBlobsWrappers, node);
2601                     }
2602                     catch (const cv::Exception& e)
2603                     {
2604                         CV_LOG_ERROR(NULL, "forwardVkCom failed, falling back to the CPU implementation. " << e.what());
2605                         it->second = Ptr<BackendNode>();
2606                         forwardLayer(ld);
2607                     }
2608                 }
2609                 else
2610                 {
2611                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2612                 }
2613             }
2614         }
2615         else
2616             tm.reset();
2617
2618         tm.stop();
2619         layersTimings[ld.id] = tm.getTimeTicks();
2620
2621         ld.flag = 1;
2622     }
2623
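         // Forwards every not-yet-executed layer whose id is smaller than ld.id and
         // then ld itself. For the CUDA backend the stream is synchronized afterwards
         // so that all device work has finished before outputs are consumed.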
2624     void forwardToLayer(LayerData &ld, bool clearFlags = true)
2625     {
2626         CV_TRACE_FUNCTION();
2627
2628         if (clearFlags)
2629         {
2630             MapIdToLayerData::iterator it;
2631             for (it = layers.begin(); it != layers.end(); it++)
2632                 it->second.flag = 0;
2633         }
2634
2635         // already forwarded
2636         if (ld.flag)
2637             return;
2638
2639         //forward parents
2640         MapIdToLayerData::iterator it;
2641         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2642         {
2643             LayerData &ld = it->second;
2644             if (ld.flag)
2645                 continue;
2646             forwardLayer(ld);
2647         }
2648
2649         //forward itself
2650         forwardLayer(ld);
2651
2652 #ifdef HAVE_CUDA
2653         if (preferableBackend == DNN_BACKEND_CUDA)
2654             cudaInfo->context.stream.synchronize();
2655 #endif
2656     }
2657
2658     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2659     {
2660         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2661
2662         if (id == 0 && inOutShapes[id].in[0].empty())
2663         {
2664             if (!layers[0].outputBlobs.empty())
2665             {
2666                 ShapesVec shapes;
2667                 for (int i = 0; i < layers[0].outputBlobs.size(); i++)
2668                 {
2669                     Mat& inp = layers[0].outputBlobs[i];
2670                     CV_Assert(inp.total());
2671                     shapes.push_back(shape(inp));
2672                 }
2673                 inOutShapes[0].in = shapes;
2674             }
2675             else
2676             {
2677                 inOutShapes[0].out.clear();
2678                 return;
2679             }
2680         }
2681
2682         if (inOutShapes[id].in.empty())
2683         {
2684             for(int i = 0; i < inputLayerIds.size(); i++)
2685             {
2686                 int layerId = inputLayerIds[i].lid;
2687                 LayersShapesMap::iterator it =
2688                         inOutShapes.find(layerId);
2689                 if(it == inOutShapes.end() ||
2690                         it->second.out.empty())
2691                 {
2692                     getLayerShapesRecursively(layerId, inOutShapes);
2693                 }
2694                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2695                 inOutShapes[id].in.push_back(shape);
2696             }
2697         }
2698         const ShapesVec& is = inOutShapes[id].in;
2699         ShapesVec& os = inOutShapes[id].out;
2700         ShapesVec& ints = inOutShapes[id].internal;
2701         int requiredOutputs = layers[id].requiredOutputs.size();
2702         inOutShapes[id].supportInPlace =
2703                 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2704
2705         for (int i = 0; i < ints.size(); i++)
2706             CV_Assert(total(ints[i]) > 0);
2707
2708         for (int i = 0; i < os.size(); i++)
2709             CV_Assert(total(os[i]) > 0);
2710     }
2711
2712     void getLayersShapes(const ShapesVec& netInputShapes,
2713                          LayersShapesMap& inOutShapes)
2714     {
2715         inOutShapes.clear();
2716
2717         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2718         for (MapIdToLayerData::iterator it = layers.begin();
2719              it != layers.end(); it++)
2720         {
2721             getLayerShapesRecursively(it->first, inOutShapes);
2722         }
2723     }
2724
2725     void getLayerShapes(const ShapesVec& netInputShapes,
2726                         const int layerId,
2727                         LayerShapes& shapes)
2728     {
2729         LayersShapesMap inOutShapes;
2730         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2731         getLayerShapesRecursively(layerId, inOutShapes);
2732         shapes = inOutShapes[layerId];
2733     }
2734
2735     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2736     {
2737         return *std::max_element(pins.begin(), pins.end());
2738     }
2739
2740     Mat getBlob(const LayerPin& pin)
2741     {
2742         CV_TRACE_FUNCTION();
2743
2744         if (!pin.valid())
2745             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2746
2747         LayerData &ld = layers[pin.lid];
2748         if ((size_t)pin.oid >= ld.outputBlobs.size())
2749         {
2750             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %zu outputs, "
2751                                            "but output #%d was requested", ld.name.c_str(),
2752                                            ld.outputBlobs.size(), pin.oid));
2753         }
2754         if (preferableTarget != DNN_TARGET_CPU)
2755         {
2756             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2757                             // Transfer data to CPU if it's required.
2758             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2759         }
2760
2761         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2762         {
2763             convertFp16(ld.outputBlobs[pin.oid], output_blob);
2764             return output_blob;
2765         }
2766         else
2767             return ld.outputBlobs[pin.oid];
2768     }
2769
2770     Mat getBlob(String outputName)
2771     {
2772         return getBlob(getPinByAlias(outputName));
2773     }
2774
2775 #ifdef CV_CXX11
2776     AsyncArray getBlobAsync(const LayerPin& pin)
2777     {
2778         CV_TRACE_FUNCTION();
2779 #ifdef HAVE_INF_ENGINE
2780         if (!pin.valid())
2781             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2782
2783         LayerData &ld = layers[pin.lid];
2784         if ((size_t)pin.oid >= ld.outputBlobs.size())
2785         {
2786             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2787                                            "but output #%d was requested", ld.name.c_str(),
2788                                            (int)ld.outputBlobs.size(), (int)pin.oid));
2789         }
2790         if (preferableTarget != DNN_TARGET_CPU)
2791         {
2792             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2793                             // Transfer data to CPU if it's required.
2794             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2795         }
2796         CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
2797
2798         Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
2799         return std::move(wrapper->futureMat);
2800 #else
2801         CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
2802 #endif
2803     }
2804
2805     AsyncArray getBlobAsync(String outputName)
2806     {
2807         return getBlobAsync(getPinByAlias(outputName));
2808     }
2809 #endif  // CV_CXX11
2810 };
2811
2812 Net::Net() : impl(new Net::Impl)
2813 {
2814 }
2815
2816 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2817 {
2818 #ifndef HAVE_INF_ENGINE
2819     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2820 #else
2821     InferenceEngine::CNNNetReader reader;
2822     reader.ReadNetwork(xml);
2823     reader.ReadWeights(bin);
2824
2825     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2826
2827     std::vector<String> inputsNames;
2828     std::vector<MatShape> inp_shapes;
2829     for (auto& it : ieNet.getInputsInfo())
2830     {
2831         inputsNames.push_back(it.first);
2832         std::vector<size_t> dims = it.second->getTensorDesc().getDims();
2833         inp_shapes.push_back(std::vector<int>(dims.begin(), dims.end()));
2834     }
2835
2836     Net cvNet;
2837     cvNet.setInputsNames(inputsNames);
2838
2839     // set placeholder inputs of the proper shapes so that input shapes can be determined
2840     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
2841     {
2842         cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
2843     }
2844
2845     Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
2846     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2847     for (auto& it : ieNet.getOutputsInfo())
2848     {
2849         Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
2850         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2851         CV_Assert(ieLayer);
2852
2853         LayerParams lp;
2854         int lid = cvNet.addLayer(it.first, "", lp);
2855
2856         LayerData& ld = cvNet.impl->layers[lid];
2857         cvLayer->name = it.first;
2858         cvLayer->type = ieLayer->type;
2859         ld.layerInstance = cvLayer;
2860         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2861
2862         for (int i = 0; i < inputsNames.size(); ++i)
2863             cvNet.connect(0, i, lid, i);
2864     }
2865     cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2866
2867     cvNet.impl->skipInfEngineInit = true;
2868     return cvNet;
2869 #endif  // HAVE_INF_ENGINE
2870 }
2871
2872 Net::~Net()
2873 {
2874 }
2875
2876 int Net::addLayer(const String &name, const String &type, LayerParams &params)
2877 {
2878     CV_TRACE_FUNCTION();
2879
2880     if (impl->getLayerId(name) >= 0)
2881     {
2882         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" already exists in the net");
2883         return -1;
2884     }
2885
2886     int id = ++impl->lastLayerId;
2887     impl->layerNameToId.insert(std::make_pair(name, id));
2888     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2889
2890     return id;
2891 }
2892
2893 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
2894 {
2895     CV_TRACE_FUNCTION();
2896
2897     int prvLid = impl->lastLayerId;
2898     int newLid = this->addLayer(name, type, params);
2899     this->connect(prvLid, 0, newLid, 0);
2900     return newLid;
2901 }
2902
2903 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2904 {
2905     CV_TRACE_FUNCTION();
2906
2907     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2908 }
2909
2910 void Net::connect(String _outPin, String _inPin)
2911 {
2912     CV_TRACE_FUNCTION();
2913
2914     LayerPin outPin = impl->getPinByAlias(_outPin);
2915     LayerPin inpPin = impl->getPinByAlias(_inPin);
2916
2917     CV_Assert(outPin.valid() && inpPin.valid());
2918
2919     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2920 }
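
// A minimal sketch of building a graph by hand with addLayer()/addLayerToPrev()/connect().
// The layer names, the "ReLU" type and the 4x4 input below are illustrative assumptions.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleManualGraph()
{
    cv::dnn::Net net;
    net.setInputsNames(std::vector<cv::String>(1, "data"));   // layer 0 is the input layer

    cv::dnn::LayerParams lp;
    int relu1 = net.addLayerToPrev("relu1", "ReLU", lp);      // connects to (layer 0, output 0)
    int relu2 = net.addLayer("relu2", "ReLU", lp);
    net.connect(relu1, 0, relu2, 0);                          // or: net.connect("relu1", "relu2")

    net.setInput(cv::Mat::ones(cv::Size(4, 4), CV_32F), "data");
    cv::Mat out = net.forward("relu2");                       // ReLU of an all-ones input
}
#endif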
2921
2922 Mat Net::forward(const String& outputName)
2923 {
2924     CV_TRACE_FUNCTION();
2925
2926     String layerName = outputName;
2927
2928     if (layerName.empty())
2929         layerName = getLayerNames().back();
2930
2931     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2932     impl->setUpNet(pins);
2933     impl->forwardToLayer(impl->getLayerData(layerName));
2934
2935     return impl->getBlob(layerName);
2936 }
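
// A minimal single-output inference sketch. The model path, input size and
// preprocessing constants are illustrative assumptions, not taken from this file.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleForward()
{
    cv::dnn::Net net = cv::dnn::readNet("classifier.onnx");   // hypothetical model

    cv::Mat image(224, 224, CV_8UC3, cv::Scalar::all(0));     // stand-in for a real frame
    cv::Mat blob = cv::dnn::blobFromImage(image, 1.0 / 255.0, cv::Size(224, 224),
                                          cv::Scalar(), /*swapRB=*/true, /*crop=*/false);
    net.setInput(blob);

    cv::Mat prob = net.forward();                             // last layer by default

    cv::Point classId;
    double confidence;
    cv::minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classId);
}
#endif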
2937
2938 AsyncArray Net::forwardAsync(const String& outputName)
2939 {
2940     CV_TRACE_FUNCTION();
2941 #ifdef CV_CXX11
2942     String layerName = outputName;
2943
2944     if (layerName.empty())
2945         layerName = getLayerNames().back();
2946
2947     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2948     impl->setUpNet(pins);
2949
2950     if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
2951         CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only by the DNN_BACKEND_INFERENCE_ENGINE backend");
2952
2953     impl->isAsync = true;
2954     impl->forwardToLayer(impl->getLayerData(layerName));
2955     impl->isAsync = false;
2956
2957     return impl->getBlobAsync(layerName);
2958 #else
2959     CV_Error(Error::StsNotImplemented, "Asynchronous forward requires a build with C++11 support");
2960 #endif  // CV_CXX11
2961 }
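
// Asynchronous inference is only available with the Inference Engine backend.
// A sketch, assuming a hypothetical Model Optimizer IR ("model.xml"/"model.bin"):
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleForwardAsync()
{
    cv::dnn::Net net = cv::dnn::readNet("model.xml", "model.bin");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);

    net.setInput(cv::dnn::blobFromImage(cv::Mat(224, 224, CV_8UC3, cv::Scalar::all(0))));

    cv::AsyncArray async = net.forwardAsync();                // request is queued

    cv::Mat out;
    async.get(out);                                           // blocks until the request finishes
}
#endif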
2962
2963 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2964 {
2965     CV_TRACE_FUNCTION();
2966
2967     String layerName = outputName;
2968
2969     if (layerName.empty())
2970         layerName = getLayerNames().back();
2971
2972     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2973     impl->setUpNet(pins);
2974     impl->forwardToLayer(impl->getLayerData(layerName));
2975
2976     LayerPin pin = impl->getPinByAlias(layerName);
2977     LayerData &ld = impl->layers[pin.lid];
2978
2979     if (outputBlobs.isUMat())
2980     {
2981         impl->getBlob(layerName).copyTo(outputBlobs);
2982     }
2983     else if (outputBlobs.isMat())
2984     {
2985         outputBlobs.assign(impl->getBlob(layerName));
2986     }
2987     else if (outputBlobs.isMatVector())
2988     {
2989         if (impl->preferableTarget != DNN_TARGET_CPU)
2990         {
2991             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2992             {
2993                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2994                 ld.outputBlobsWrappers[i]->copyToHost();
2995             }
2996         }
2997         if (ld.outputBlobs[0].depth() == CV_32F)
2998         {
2999             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3000             outputvec = ld.outputBlobs;
3001         } else {
3002             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3003             outputvec.resize(ld.outputBlobs.size());
3004             for (int i = 0; i < outputvec.size(); i++)
3005                 convertFp16(ld.outputBlobs[i], outputvec[i]);
3006         }
3007     }
3008     else if (outputBlobs.isUMatVector())
3009     {
3010         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
3011
3012         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
3013             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
3014         {
3015             if (impl->preferableTarget == DNN_TARGET_OPENCL)
3016                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
3017             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
3018             {
3019                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
3020                 outputvec.resize(out_vec.size());
3021                 for (int i = 0; i < out_vec.size(); i++)
3022                     convertFp16(out_vec[i], outputvec[i]);
3023             }
3024         }
3025         else
3026         {
3027             outputvec.resize(ld.outputBlobs.size());
3028             for (int i = 0; i < outputvec.size(); ++i)
3029                 ld.outputBlobs[i].copyTo(outputvec[i]);
3030         }
3031     }
3032 }
3033
3034 void Net::forward(OutputArrayOfArrays outputBlobs,
3035                   const std::vector<String>& outBlobNames)
3036 {
3037     CV_TRACE_FUNCTION();
3038
3039     std::vector<LayerPin> pins;
3040     for (int i = 0; i < outBlobNames.size(); i++)
3041     {
3042         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
3043     }
3044
3045     impl->setUpNet(pins);
3046
3047     LayerPin out = impl->getLatestLayerPin(pins);
3048
3049     impl->forwardToLayer(impl->getLayerData(out.lid));
3050
3051     std::vector<Mat> matvec;
3052     for (int i = 0; i < pins.size(); i++)
3053     {
3054         matvec.push_back(impl->getBlob(pins[i]));
3055     }
3056
3057     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3058     outputvec = matvec;
3059 }
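
// Requesting several outputs at once is the usual pattern for detectors with
// multiple unconnected output layers. A sketch with a hypothetical Darknet model:
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleMultiOutputForward()
{
    cv::dnn::Net net = cv::dnn::readNetFromDarknet("model.cfg", "model.weights");

    cv::Mat blob = cv::dnn::blobFromImage(cv::Mat(416, 416, CV_8UC3, cv::Scalar::all(0)),
                                          1.0 / 255.0, cv::Size(416, 416));
    net.setInput(blob);

    std::vector<cv::Mat> outs;
    net.forward(outs, net.getUnconnectedOutLayersNames());
    // outs[i] holds the blob produced by the i-th requested output layer.
}
#endif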
3060
3061 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
3062                      const std::vector<String>& outBlobNames)
3063 {
3064     CV_TRACE_FUNCTION();
3065
3066     std::vector<LayerPin> pins;
3067     for (int i = 0; i < outBlobNames.size(); i++)
3068     {
3069         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
3070     }
3071
3072     impl->setUpNet(pins);
3073
3074     LayerPin out = impl->getLatestLayerPin(pins);
3075
3076     impl->forwardToLayer(impl->getLayerData(out.lid));
3077
3078     outputBlobs.resize(outBlobNames.size());
3079     for (int i = 0; i < outBlobNames.size(); i++)
3080     {
3081         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
3082         outputBlobs[i].resize(lp.size());
3083         for (int j = 0; j < lp.size(); j++)
3084         {
3085             outputBlobs[i][j] = impl->getBlob(lp[j]);
3086         }
3087     }
3088 }
3089
3090 void Net::setPreferableBackend(int backendId)
3091 {
3092     CV_TRACE_FUNCTION();
3093     CV_TRACE_ARG(backendId);
3094
3095     if( impl->preferableBackend != backendId )
3096     {
3097         impl->preferableBackend = backendId;
3098         impl->netWasAllocated = false;
3099         impl->clear();
3100     }
3101 }
3102
3103 void Net::setPreferableTarget(int targetId)
3104 {
3105     CV_TRACE_FUNCTION();
3106     CV_TRACE_ARG(targetId);
3107
3108     if( impl->preferableTarget != targetId )
3109     {
3110         impl->preferableTarget = targetId;
3111         if (IS_DNN_OPENCL_TARGET(targetId))
3112         {
3113 #ifndef HAVE_OPENCL
3114 #ifdef HAVE_INF_ENGINE
3115             if (impl->preferableBackend == DNN_BACKEND_OPENCV)
3116 #else
3117             if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
3118                 impl->preferableBackend == DNN_BACKEND_OPENCV)
3119 #endif  // HAVE_INF_ENGINE
3120                 impl->preferableTarget = DNN_TARGET_CPU;
3121 #else
3122             bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
3123             if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
3124                 impl->preferableTarget = DNN_TARGET_OPENCL;
3125 #endif
3126         }
3127         impl->netWasAllocated = false;
3128         impl->clear();
3129     }
3130 }
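
// Selecting a backend/target pair. The CUDA identifiers below assume a build with the
// CUDA backend introduced by this patch series; unsupported combinations may be
// remapped when the network is set up.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleBackendSelection(cv::dnn::Net& net)
{
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);

    // An OpenCL alternative on the default OpenCV backend:
    // net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
    // net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
}
#endif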
3131
3132 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
3133 {
3134     CV_TRACE_FUNCTION();
3135
3136     impl->netInputLayer->setNames(inputBlobNames);
3137 }
3138
3139 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
3140 {
3141     CV_TRACE_FUNCTION();
3142     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3143
3144     LayerPin pin;
3145     pin.lid = 0;
3146     pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);
3147
3148     if (!pin.valid())
3149         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
3150
3151     LayerData &ld = impl->layers[pin.lid];
3152     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
3153     ld.outputBlobs.resize(numInputs);
3154     ld.outputBlobsWrappers.resize(numInputs);
3155     impl->netInputLayer->inputsData.resize(numInputs);
3156     impl->netInputLayer->scaleFactors.resize(numInputs);
3157     impl->netInputLayer->means.resize(numInputs);
3158
3159     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
3160     Mat blob_ = blob.getMat();
3161     bool oldShape = prevShape == shape(blob_);
3162     if (oldShape)
3163     {
3164         blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
3165     }
3166     else
3167     {
3168         ld.outputBlobs[pin.oid] = blob_.clone();
3169         impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
3170     }
3171
3172     if (!ld.outputBlobsWrappers[pin.oid].empty())
3173     {
3174         ld.outputBlobsWrappers[pin.oid]->setHostDirty();
3175     }
3176     impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
3177     impl->netInputLayer->means[pin.oid] = mean;
3178     impl->netWasAllocated = impl->netWasAllocated && oldShape;
3179 }
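
// Mean subtraction and scaling may be applied either in blobFromImage() or via the
// setInput() arguments. A sketch; the input name "data" and the constants are assumptions.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleSetInput(cv::dnn::Net& net)
{
    cv::Mat image(224, 224, CV_8UC3, cv::Scalar::all(0));     // stand-in for a real frame
    cv::Mat blob = cv::dnn::blobFromImage(image);              // NCHW, CV_32F, no preprocessing

    const double scale = 1.0 / 255.0;
    const cv::Scalar mean(104.0, 117.0, 123.0);
    net.setInput(blob, "data", scale, mean);                   // applied when the net runs
}
#endif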
3180
3181 Mat Net::getParam(LayerId layer, int numParam)
3182 {
3183     LayerData &ld = impl->getLayerData(layer);
3184     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3185     CV_Assert(numParam < (int)layerBlobs.size());
3186     return layerBlobs[numParam];
3187 }
3188
3189 void Net::setParam(LayerId layer, int numParam, const Mat &blob)
3190 {
3191     LayerData &ld = impl->getLayerData(layer);
3192
3193     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3194     CV_Assert(numParam < (int)layerBlobs.size());
3195     // we don't make strong checks; use this function carefully
3196     layerBlobs[numParam] = blob;
3197 }
3198
3199 int Net::getLayerId(const String &layer)
3200 {
3201     return impl->getLayerId(layer);
3202 }
3203
3204 String parseLayerParams(const String& name, const LayerParams& lp) {
3205     DictValue param = lp.get(name);
3206     std::ostringstream out;
3207     out << name << " ";
3208     switch (param.size()) {
3209         case 1: out << ": "; break;
3210         case 2: out << "(HxW): "; break;
3211         case 3: out << "(DxHxW): "; break;
3212         default: CV_Error(Error::StsNotImplemented, format("Unsupported %s size = %d", name.c_str(), param.size()));
3213     }
3214     for (size_t i = 0; i < param.size() - 1; i++) {
3215         out << param.get<int>(i) << " x ";
3216     }
3217     out << param.get<int>(param.size() - 1) << "\\l";
3218     return out.str();
3219 }
3220
3221 String Net::dump()
3222 {
3223     CV_Assert(!empty());
3224
3225     if (impl->netInputLayer->inputsData.empty())
3226         CV_Error(Error::StsError, "Network input(s) are not set; call setInput() before dump()");
3227
3228     if (!impl->netWasAllocated)
3229         impl->setUpNet();
3230
3231     std::ostringstream out;
3232     std::map<int, LayerData>& map = impl->layers;
3233     int prefBackend = impl->preferableBackend;
3234     std::vector<std::vector<int> > skippedLayers;
3235     std::vector<int> skipId;
3236     std::vector<int> allLayers(map.size(), -1);
3237     int idPrev = -1;
3238     Ptr<BackendNode> prevNode;
3239     for (std::map<int, LayerData>::reverse_iterator rit = map.rbegin(); rit != map.rend(); ++rit)
3240     {
3241         std::map<int, Ptr<BackendNode> >::iterator itBackend = rit->second.backendNodes.find(prefBackend);
3242         if (prefBackend == DNN_BACKEND_OPENCV || itBackend == rit->second.backendNodes.end() ||
3243             itBackend->second.empty())
3244         {
3245                 if (rit->second.skip)
3246                     skipId.push_back(rit->first);
3247                 else if (!skipId.empty())
3248                 {
3249                     if (prefBackend == DNN_BACKEND_OPENCV || prevNode.empty())
3250                         skipId.push_back(rit->first);
3251                     else if (idPrev != -1)
3252                         skipId.push_back(idPrev);
3253
3254                     std::sort(skipId.begin(), skipId.end());
3255                     for (int i = 0; i < skipId.size(); i++) {
3256                         allLayers[skipId[i]] = skippedLayers.size();
3257                     }
3258                     skippedLayers.push_back(skipId);
3259                     skipId.clear();
3260                 }
3261         }
3262         else
3263         {
3264             if (itBackend->second == prevNode)
3265                 skipId.push_back(idPrev);
3266             else if (!skipId.empty())
3267             {
3268                 skipId.push_back(idPrev);
3269                 std::sort(skipId.begin(), skipId.end());
3270                 for (int i = 0; i < skipId.size(); i++) {
3271                     allLayers[skipId[i]] = skippedLayers.size();
3272                 }
3273                 skippedLayers.push_back(skipId);
3274                 skipId.clear();
3275             }
3276             idPrev = rit->first;
3277             prevNode = itBackend->second;
3278         }
3279     }
3280     String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848"};
3281     String backend;
3282     switch (prefBackend) {
3283         case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
3284         case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
3285         case DNN_BACKEND_INFERENCE_ENGINE: backend = "DLIE/"; break;
3286         case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
3287         case DNN_BACKEND_CUDA: backend = "CUDA/"; break;
3288     }
3289     out << "digraph G {" << '\n';
3290     // Add nodes
3291     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3292     {
3293         String name = it->second.params.name;
3294         if (allLayers[it->first] == -1 && !name.empty()) {
3295             out << "    " << "\"" << name << "\"" << " [label=\"";
3296             skipId.clear();
3297             skipId.push_back(it->first);
3298         }
3299         else if (name.empty() || it->first != skippedLayers[allLayers[it->first]][0])
3300             continue;
3301         else { // first node in cluster : it->first == skippedLayers[allLayers[it->first]][0]
3302             int cluster = allLayers[it->first];
3303             out << "    " << "\"" << "cluster_" << cluster << "\"" << " [label=\"{";
3304             skipId = skippedLayers[allLayers[it->first]]; // vertices in current cluster
3305         }
3306         for (int i = 0; i < skipId.size(); i++)
3307         {
3308             LayerParams& lp = map[skipId[i]].params;
3309             if (!lp.name.empty()) {
3310                 if (i > 0) {
3311                     out << " | ";
3312                 }
3313                 out << lp.name << "\\n" << lp.type << "\\n";
3314                 if (lp.has("kernel_size")) {
3315                     String kernel = parseLayerParams("kernel_size", lp);
3316                     out << kernel;
3317                 } else if (lp.has("kernel_h") && lp.has("kernel_w")) {
3318                     DictValue h = lp.get("kernel_h");
3319                     DictValue w = lp.get("kernel_w");
3320                     out << "kernel (HxW): " << h << " x " << w << "\\l";
3321                 }
3322                 if (lp.has("stride")) {
3323                     String stride = parseLayerParams("stride", lp);
3324                     out << stride;
3325                 } else if (lp.has("stride_h") && lp.has("stride_w")) {
3326                     DictValue h = lp.get("stride_h");
3327                     DictValue w = lp.get("stride_w");
3328                     out << "stride (HxW): " << h << " x " << w << "\\l";
3329                 }
3330                 if (lp.has("dilation")) {
3331                     String dilation = parseLayerParams("dilation", lp);
3332                     out << dilation;
3333                 } else if (lp.has("dilation_h") && lp.has("dilation_w")) {
3334                     DictValue h = lp.get("dilation_h");
3335                     DictValue w = lp.get("dilation_w");
3336                     out << "dilation (HxW): " << h << " x " << w << "\\l";
3337                 }
3338                 if (lp.has("pad")) {
3339                     DictValue pad = lp.get("pad");
3340                     out << "pad ";
3341                     switch (pad.size()) {
3342                         case 1: out << ": " << pad << "\\l"; break;
3343                         case 2: out << "(HxW): (" << pad.get<int>(0) << " x " << pad.get<int>(1) << ")" << "\\l"; break;
3344                         case 4: out << "(HxW): (" << pad.get<int>(0) << ", " << pad.get<int>(2) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(3) << ")" << "\\l"; break;
3345                         case 6: out << "(DxHxW): (" << pad.get<int>(0) << ", " << pad.get<int>(3) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(4)
3346                                 << ") x (" << pad.get<int>(2) << ", " << pad.get<int>(5) << ")" << "\\l"; break;
3347                         default: CV_Error(Error::StsNotImplemented,  format("Unsupported pad size = %d", pad.size()));
3348                     }
3349                  } else if (lp.has("pad_l") && lp.has("pad_t") && lp.has("pad_r") && lp.has("pad_b")) {
3350                      DictValue l = lp.get("pad_l");
3351                      DictValue t = lp.get("pad_t");
3352                      DictValue r = lp.get("pad_r");
3353                      DictValue b = lp.get("pad_b");
3354                      out << "pad (HxW): (" << t << ", " << b << ") x (" << l << ", " << r << ")" << "\\l";
3355                  }
3356                  else if (lp.has("pooled_w") || lp.has("pooled_h")) {
3357                      DictValue h = lp.get("pooled_h");
3358                      DictValue w = lp.get("pooled_w");
3359                      out << "pooled (HxW): " << h << " x " << w << "\\l";
3360                  }
3361                  if (lp.has("pool")) {
3362                      out << "pool: " << lp.get("pool") << "\\l";
3363                  }
3364                  if (lp.has("global_pooling")) {
3365                      out << "global_pooling: " << lp.get("global_pooling") << "\\l";
3366                  }
3367                  if (lp.has("group")) {
3368                      out << "group: " << lp.get("group") << "\\l";
3369                  }
3370              }
3371          }
3372          if (!it->second.outputBlobs.empty())
3373              out << "output: " << it->second.outputBlobs[0].size << "\\l";
3374
3375          Ptr<BackendNode> layerBackend = it->second.backendNodes[prefBackend];
3376          out << (!layerBackend.empty() ? backend : "OCV/");
3377          int colorId = 0;
3378          switch (it->second.layerInstance->preferableTarget) {
3379              case DNN_TARGET_CPU: out << "CPU\\n"; colorId = layerBackend.empty() ? 0 : 5; break;
3380              case DNN_TARGET_OPENCL: out << "OCL\\n"; colorId = 1; break;
3381              case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16\\n"; colorId = 2; break;
3382              case DNN_TARGET_MYRIAD: out << "MYRIAD\\n"; colorId = 3; break;
3383              case DNN_TARGET_FPGA: out << "FPGA\\n"; colorId = 4; break;
3384              case DNN_TARGET_CUDA: out << "CUDA\\n"; colorId = 5; break;
3385              case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16\\n"; colorId = 6; break;
3386          }
3387          out << ((skipId.size() == 1)? "\" " : " }\" ");
3388          out << "fillcolor=\"" << colors[colorId] << "\" ";
3389          out << "style=filled ";
3390          out << "shape=" << ((skipId.size() == 1)? "box" : "record") << "]" << '\n';
3391     }
3392     out << '\n';
3393     // Add edges
3394     int inputsSize = impl->netInputLayer->outNames.size();
3395     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3396     {
3397         if (allLayers[it->first] == -1)  // node
3398         {
3399             for (int i = 0; i < it->second.consumers.size(); i++)
3400             {
3401                 int outId = it->second.consumers[i].lid;
3402                 if (it == map.begin() && inputsSize > 1)
3403                     out << "    " << "\"" << it->second.name << "_" << i << "\"" << " -> ";
3404                 else
3405                     out << "    " << "\"" << it->second.name << "\"" << " -> ";
3406                 if (allLayers[outId] == -1)  // node
3407                     out << "\"" << map[outId].name << "\"" << '\n';
3408                 else  // cluster
3409                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3410             }
3411         }
3412         else if (it->first == skippedLayers[allLayers[it->first]].back())  // edges from last layer in cluster
3413         {
3414             for (int i = 0; i < it->second.consumers.size(); i++)
3415             {
3416                 int outId = it->second.consumers[i].lid;
3417                 if (allLayers[outId] == -1) { // node
3418                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3419                     out << "\"" << map[outId].name << "\"" << '\n';
3420                 }
3421                 else if (allLayers[outId] != allLayers[it->first]) { // another cluster
3422                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3423                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3424                 }
3425             }
3426         }
3427     }
3428     out << "}";
3429     return out.str();
3430 }
3431
3432 void Net::dumpToFile(const String& path) {
3433     std::ofstream file(path.c_str());
3434     file << dump();
3435     file.close();
3436 }
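
// The dump is a Graphviz description; it requires the inputs to be set so the net
// can be allocated first. A sketch with an assumed output file name:
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

void exampleDump(cv::dnn::Net& net)
{
    net.dumpToFile("net.dot");
    // Render offline with Graphviz, e.g.:  dot -Tpng net.dot -o net.png
}
#endif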
3437
3438 Ptr<Layer> Net::getLayer(LayerId layerId)
3439 {
3440     LayerData &ld = impl->getLayerData(layerId);
3441     return ld.getLayerInstance();
3442 }
3443
3444 std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
3445 {
3446     LayerData &ld = impl->getLayerData(layerId);
3447     if (!ld.layerInstance)
3448         CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));
3449
3450     std::vector<Ptr<Layer> > inputLayers;
3451     inputLayers.reserve(ld.inputLayersId.size());
3452     std::set<int>::iterator it;
3453     for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
3454         inputLayers.push_back(getLayer(*it));
3455     }
3456     return inputLayers;
3457 }
3458
3459 std::vector<String> Net::getLayerNames() const
3460 {
3461     std::vector<String> res;
3462     res.reserve(impl->layers.size());
3463
3464     Impl::MapIdToLayerData::iterator it;
3465     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3466     {
3467         if (it->second.id) //skip Data layer
3468             res.push_back(it->second.name);
3469     }
3470
3471     return res;
3472 }
3473
3474 bool Net::empty() const
3475 {
3476     return impl->layers.size() <= 1; //first layer is default Data layer
3477 }
3478
3479 std::vector<int> Net::getUnconnectedOutLayers() const
3480 {
3481     std::vector<int> layersIds;
3482
3483     Impl::MapIdToLayerData::iterator it;
3484     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3485     {
3486         int lid = it->first;
3487         LayerData &ld = it->second;
3488
3489         if (ld.requiredOutputs.size() == 0)
3490             layersIds.push_back(lid);
3491     }
3492
3493     return layersIds;
3494 }
3495
3496 std::vector<String> Net::getUnconnectedOutLayersNames() const
3497 {
3498     std::vector<int> ids = getUnconnectedOutLayers();
3499     const size_t n = ids.size();
3500     std::vector<String> names(n);
3501     for (size_t i = 0; i < n; ++i)
3502     {
3503         names[i] = impl->layers[ids[i]].name;
3504     }
3505     return names;
3506 }
3507
3508 void Net::getLayersShapes(const ShapesVec& netInputShapes,
3509                           std::vector<int>& layersIds,
3510                           std::vector<ShapesVec>& inLayersShapes,
3511                           std::vector<ShapesVec>& outLayersShapes) const
3512 {
3513     layersIds.clear();
3514     inLayersShapes.clear();
3515     outLayersShapes.clear();
3516
3517     Impl::LayersShapesMap inOutShapes;
3518     impl->getLayersShapes(netInputShapes, inOutShapes);
3519
3520     for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
3521         it != inOutShapes.end(); it++)
3522     {
3523         layersIds.push_back(it->first);
3524         inLayersShapes.push_back(it->second.in);
3525         outLayersShapes.push_back(it->second.out);
3526     }
3527 }
3528
3529 void Net::getLayersShapes(const MatShape& netInputShape,
3530                           std::vector<int>& layerIds,
3531                           std::vector<ShapesVec>& inLayersShapes,
3532                           std::vector<ShapesVec>& outLayersShapes) const
3533 {
3534     getLayersShapes(ShapesVec(1, netInputShape),
3535                     layerIds, inLayersShapes, outLayersShapes);
3536 }
3537
3538 void Net::getLayerShapes(const MatShape& netInputShape,
3539                          const int layerId,
3540                          ShapesVec& inLayerShapes,
3541                          ShapesVec& outLayerShapes) const
3542 {
3543     getLayerShapes(ShapesVec(1, netInputShape),
3544                    layerId, inLayerShapes, outLayerShapes);
3545
3546 }
3547
3548 void Net::getLayerShapes(const ShapesVec& netInputShapes,
3549                     const int layerId,
3550                     ShapesVec& inLayerShapes,
3551                     ShapesVec& outLayerShapes) const
3552 {
3553     LayerShapes shapes;
3554     impl->getLayerShapes(netInputShapes, layerId, shapes);
3555     inLayerShapes = shapes.in;
3556     outLayerShapes = shapes.out;
3557 }
3558
3559 int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
3560 {
3561     CV_TRACE_FUNCTION();
3562
3563     int64 flops = 0;
3564     std::vector<int> ids;
3565     std::vector<std::vector<MatShape> > inShapes, outShapes;
3566     getLayersShapes(netInputShapes, ids, inShapes, outShapes);
3567     CV_Assert(inShapes.size() == outShapes.size());
3568     CV_Assert(inShapes.size() == ids.size());
3569
3570     for(int i = 0; i < ids.size(); i++)
3571     {
3572         flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
3573                                                                    outShapes[i]);
3574     }
3575
3576     return flops;
3577 }
3578
3579 int64 Net::getFLOPS(const MatShape& netInputShape) const
3580 {
3581     return getFLOPS(std::vector<MatShape>(1, netInputShape));
3582 }
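
// Estimating the theoretical complexity for a given input shape. MatShape is a
// std::vector<int> in NCHW order; the 1x3x224x224 shape below is an assumption.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>
#include <iostream>

void exampleFlops(const cv::dnn::Net& net)
{
    cv::dnn::MatShape inputShape = {1, 3, 224, 224};
    int64 flops = net.getFLOPS(inputShape);
    std::cout << "complexity: " << flops * 1e-9 << " GFLOPs" << std::endl;
}
#endif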
3583
3584 int64 Net::getFLOPS(const int layerId,
3585               const std::vector<MatShape>& netInputShapes) const
3586 {
3587     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3588     CV_Assert(layer != impl->layers.end());
3589
3590     LayerShapes shapes;
3591     impl->getLayerShapes(netInputShapes, layerId, shapes);
3592
3593     return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
3594 }
3595
3596 int64 Net::getFLOPS(const int layerId,
3597               const MatShape& netInputShape) const
3598 {
3599     return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
3600 }
3601
3602 void Net::getLayerTypes(std::vector<String>& layersTypes) const
3603 {
3604     layersTypes.clear();
3605
3606     std::map<String, int> layers;
3607     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3608          it != impl->layers.end(); it++)
3609     {
3610         if (layers.find(it->second.type) == layers.end())
3611             layers[it->second.type] = 0;
3612         layers[it->second.type]++;
3613     }
3614
3615     for (std::map<String, int>::iterator it = layers.begin();
3616          it != layers.end(); it++)
3617     {
3618         layersTypes.push_back(it->first);
3619     }
3620 }
3621
3622 int Net::getLayersCount(const String& layerType) const
3623 {
3624     int count = 0;
3625     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3626          it != impl->layers.end(); it++)
3627     {
3628         if (it->second.type == layerType)
3629             count++;
3630     }
3631     return count;
3632 }
3633
3634 void Net::getMemoryConsumption(const int layerId,
3635                                const std::vector<MatShape>& netInputShapes,
3636                                size_t& weights, size_t& blobs) const
3637 {
3638     CV_TRACE_FUNCTION();
3639
3640     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3641     CV_Assert(layer != impl->layers.end());
3642
3643     weights = blobs = 0;
3644
3645     for(int i = 0; i < layer->second.params.blobs.size(); i++)
3646     {
3647         const Mat& weightsBlob = layer->second.params.blobs[i];
3648         weights += weightsBlob.total()*weightsBlob.elemSize();
3649     }
3650
3651     ShapesVec inLayerShapes, outLayerShapes;
3652     getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
3653     for(int i = 0; i < outLayerShapes.size(); i++)
3654     {
3655         blobs += total(outLayerShapes[i]) * sizeof(float);
3656     }
3657 }
3658
3659 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3660                                size_t& weights, size_t& blobs) const
3661 {
3662     CV_TRACE_FUNCTION();
3663
3664     std::vector<int> layerIds;
3665     std::vector<size_t> w, b;
3666     getMemoryConsumption(netInputShapes, layerIds, w, b);
3667
3668     weights = blobs = 0;
3669     for(int i = 0; i < layerIds.size(); i++)
3670     {
3671         weights += w[i];
3672         blobs += b[i];
3673     }
3674 }
3675
3676 void Net::getMemoryConsumption(const int layerId,
3677                                const MatShape& netInputShape,
3678                                size_t& weights, size_t& blobs) const
3679 {
3680     getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
3681                          weights, blobs);
3682 }
3683
3684 void Net::getMemoryConsumption(const MatShape& netInputShape,
3685                                size_t& weights, size_t& blobs) const
3686 {
3687     getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
3688                          weights, blobs);
3689 }
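
// Estimating memory use (weights and intermediate blobs, in bytes) for an assumed
// 1x3x224x224 input shape:
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>
#include <iostream>

void exampleMemoryConsumption(const cv::dnn::Net& net)
{
    size_t weights = 0, blobs = 0;
    cv::dnn::MatShape inputShape = {1, 3, 224, 224};
    net.getMemoryConsumption(inputShape, weights, blobs);
    std::cout << "weights: " << weights / (1024.0 * 1024.0) << " MiB, "
              << "blobs: "   << blobs   / (1024.0 * 1024.0) << " MiB" << std::endl;
}
#endif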
3690
3691 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3692                                   std::vector<int>& layerIds, std::vector<size_t>& weights,
3693                                   std::vector<size_t>& blobs) const
3694 {
3695     CV_TRACE_FUNCTION();
3696
3697     layerIds.clear();
3698     weights.clear();
3699     blobs.clear();
3700
3701     std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
3702
3703     getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
3704
3705     for(int i = 0; i < layerIds.size(); i++)
3706     {
3707         size_t w = 0, b = 0;
3708         Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
3709         CV_Assert(layer != impl->layers.end());
3710
3711         for(int j = 0; j < layer->second.params.blobs.size(); j++)
3712         {
3713             const Mat& weightsBlob = layer->second.params.blobs[j];
3714             w += weightsBlob.total()*weightsBlob.elemSize();
3715         }
3716
3717         for(int j = 0; j < outLayerShapes[i].size(); j++)
3718         {
3719             b += total(outLayerShapes[i][j]) * sizeof(float);
3720         }
3721
3722         weights.push_back(w);
3723         blobs.push_back(b);
3724     }
3725 }
3726
3727 void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
3728                                std::vector<size_t>& weights, std::vector<size_t>& blobs) const
3729 {
3730     getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
3731                          weights, blobs);
3732 }
3733
3734 void Net::enableFusion(bool fusion)
3735 {
3736     if( impl->fusion != fusion )
3737     {
3738         impl->fusion = fusion;
3739         impl->netWasAllocated = false;
3740         impl->clear();
3741     }
3742 }
3743
3744 void Net::setHalideScheduler(const String& scheduler)
3745 {
3746     CV_TRACE_FUNCTION();
3747     CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());
3748
3749     impl->halideConfigFile = scheduler;
3750 }
3751
3752 int64 Net::getPerfProfile(std::vector<double>& timings)
3753 {
3754     timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
3755     int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
3756     return total;
3757 }
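
// Per-layer timings are returned in tick counts; divide by cv::getTickFrequency()
// to obtain seconds. A sketch, assuming forward() has already been called on `net`:
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>
#include <iostream>

void examplePerfProfile(cv::dnn::Net& net)
{
    std::vector<double> layerTimes;
    int64 totalTicks = net.getPerfProfile(layerTimes);
    double totalMs = totalTicks * 1000.0 / cv::getTickFrequency();
    std::cout << "inference time: " << totalMs << " ms" << std::endl;
}
#endif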
3758
3759 //////////////////////////////////////////////////////////////////////////
3760
3761 Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
3762
3763 Layer::Layer(const LayerParams &params)
3764     : blobs(params.blobs), name(params.name), type(params.type)
3765 {
3766     preferableTarget = DNN_TARGET_CPU;
3767 }
3768
3769 void Layer::setParamsFrom(const LayerParams &params)
3770 {
3771     blobs = params.blobs;
3772     name = params.name;
3773     type = params.type;
3774 }
3775
3776 int Layer::inputNameToIndex(String)
3777 {
3778     return -1;
3779 }
3780
3781 int Layer::outputNameToIndex(const String&)
3782 {
3783     return 0;
3784 }
3785
3786 bool Layer::supportBackend(int backendId)
3787 {
3788     return backendId == DNN_BACKEND_OPENCV;
3789 }
3790
3791 Ptr<BackendNode> Layer::initCUDA(
3792     void*,
3793     const std::vector<Ptr<BackendWrapper>>&,
3794     const std::vector<Ptr<BackendWrapper>>&)
3795 {
3796     CV_Error(Error::StsNotImplemented, "CUDA pipeline of " + type +
3797                                        " layers is not defined.");
3798     return Ptr<BackendNode>();
3799 }
3800
3801 Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
3802 {
3803     CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
3804                                        " layers is not defined.");
3805     return Ptr<BackendNode>();
3806 }
3807
3808 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
3809 {
3810     CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
3811                                        " layers is not defined.");
3812     return Ptr<BackendNode>();
3813 }
3814
3815 Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
3816 {
3817     CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
3818                                        " layers is not defined.");
3819     return Ptr<BackendNode>();
3820 }
3821
3822 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
3823                                  const std::vector<Mat> &outputs, int targetId) const
3824 {
3825 #ifdef  HAVE_HALIDE
3826     CV_TRACE_FUNCTION();
3827
3828     Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
3829                 xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
3830     Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
3831
3832     int outW, outH, outC, outN;
3833     getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
3834
3835     if (targetId == DNN_TARGET_CPU)
3836     {
3837         if (outW == 1 && outH == 1)
3838         {
3839             if (outC + outN == 1)
3840                 return;
3841
3842             if (outC > 8)
3843               top.split(c, co, ci, 8)
3844                  .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3845                  .parallel(tile)
3846                  .vectorize(ci, 8);
3847             else
3848               top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
3849                  .parallel(tile);
3850         }
3851         else
3852         {
3853             if (outH > 2)
3854             {
3855                 top.reorder(x, c, y)
3856                    .split(y, yo, yi, 2)
3857                    .fuse(yo, n, tile)
3858                    .parallel(tile)
3859                    .unroll(yi)
3860                    .vectorize(x, outW >= 16 ? 16 : outW);
3861             }
3862         }
3863     }
3864     else if (targetId == DNN_TARGET_OPENCL)
3865     {
3866         if (outW == 1 && outH == 1)
3867         {
3868             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
3869             top.split(c, co, ci, c_split)
3870                .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3871                .gpu_blocks(tile)
3872                .gpu_threads(ci);
3873         }
3874         else
3875         {
3876             int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
3877             int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
3878             // Supported vectorization widths: 2, 3, 4, 8, 16
3879             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
3880             top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
3881                .split(c, co, ci, c_split)
3882                .gpu_blocks(xo, yo, co)
3883                .gpu_threads(xi, yi)
3884                .reorder(xi, yi, ci, xo, yo, co)
3885                .vectorize(ci);
3886         }
3887     }
3888     else
3889         CV_Error(Error::StsNotImplemented, "Unknown target identifier");
3890 #endif  // HAVE_HALIDE
3891 }
3892
3893 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
3894 {
3895     return Ptr<BackendNode>();
3896 }
3897
3898 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
3899 bool Layer::tryFuse(Ptr<Layer>&) { return false; }
3900 void Layer::getScaleShift(Mat& scale, Mat& shift) const
3901 {
3902     scale = Mat();
3903     shift = Mat();
3904 }
3905
3906 void Layer::unsetAttached()
3907 {
3908     setActivation(Ptr<ActivationLayer>());
3909 }
3910
3911 template <typename T>
3912 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
3913 {
3914     pv.resize(v.size());
3915     for (size_t i = 0; i < v.size(); i++)
3916         pv[i] = const_cast<T*>(&v[i]);
3917 }
3918
3919 void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
3920 {
3921     CV_TRACE_FUNCTION();
3922     this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
3923 }
3924
3925 void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
3926 {
3927     CV_UNUSED(input); CV_UNUSED(output);
3928 }
3929
3930 void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
3931 {
3932     CV_TRACE_FUNCTION();
3933     std::vector<Mat> inputs, outputs;
3934     inputs_arr.getMatVector(inputs);
3935     outputs_arr.getMatVector(outputs);
3936
3937     std::vector<Mat*> inputsp;
3938     vecToPVec(inputs, inputsp);
3939     this->finalize(inputsp, outputs);
3940 }
3941
3942 std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
3943 {
3944     CV_TRACE_FUNCTION();
3945
3946     std::vector<Mat> outputs;
3947     this->finalize(inputs, outputs);
3948     return outputs;
3949 }
3950
3951 void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
3952 {
3953     // Kept for backward compatibility: the engine now calls this overload only to support user-defined layer implementations.
3954 }
3955
3956 void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3957 {
3958     CV_TRACE_FUNCTION();
3959     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3960
3961     Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
3962 }
3963
3964 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3965 {
3966     CV_TRACE_FUNCTION();
3967     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3968
3969     if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
3970     {
3971         std::vector<UMat> inputs;
3972         std::vector<UMat> outputs;
3973         std::vector<UMat> internals;
3974
3975         std::vector<UMat> orig_inputs;
3976         std::vector<UMat> orig_outputs;
3977         std::vector<UMat> orig_internals;
3978
3979         inputs_arr.getUMatVector(orig_inputs);
3980         outputs_arr.getUMatVector(orig_outputs);
3981         internals_arr.getUMatVector(orig_internals);
3982
3983         inputs.resize(orig_inputs.size());
3984         for (size_t i = 0; i < orig_inputs.size(); i++)
3985             convertFp16(orig_inputs[i], inputs[i]);
3986
3987         outputs.resize(orig_outputs.size());
3988         for (size_t i = 0; i < orig_outputs.size(); i++)
3989             outputs[i].create(shape(orig_outputs[i]), CV_32F);
3990
3991         internals.resize(orig_internals.size());
3992         for (size_t i = 0; i < orig_internals.size(); i++)
3993             internals[i].create(shape(orig_internals[i]), CV_32F);
3994
3995         forward(inputs, outputs, internals);
3996
3997         for (size_t i = 0; i < outputs.size(); i++)
3998             convertFp16(outputs[i], orig_outputs[i]);
3999
4000         // sync results back
4001         outputs_arr.assign(orig_outputs);
4002         internals_arr.assign(orig_internals);
4003         return;
4004     }
4005     std::vector<Mat> inpvec;
4006     std::vector<Mat> outputs;
4007     std::vector<Mat> internals;
4008
4009     inputs_arr.getMatVector(inpvec);
4010     outputs_arr.getMatVector(outputs);
4011     internals_arr.getMatVector(internals);
4012
4013     std::vector<Mat*> inputs(inpvec.size());
4014     for (int i = 0; i < inpvec.size(); i++)
4015         inputs[i] = &inpvec[i];
4016
4017     this->forward(inputs, outputs, internals);
4018
4019     // sync results back
4020     outputs_arr.assign(outputs);
4021     internals_arr.assign(internals);
4022 }
4023
4024 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
4025 {
4026     CV_TRACE_FUNCTION();
4027
4028     this->finalize(inputs, outputs);
4029     this->forward(inputs, outputs, internals);
4030 }
4031
4032 Layer::~Layer() {}
4033
4034 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
4035                             const int requiredOutputs,
4036                             std::vector<MatShape> &outputs,
4037                             std::vector<MatShape> &internals) const
4038 {
4039     CV_Assert(inputs.size());
4040     outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
4041     return false;
4042 }
4043
4044 //////////////////////////////////////////////////////////////////////////
4045
4046 static Mutex& getLayerFactoryMutex()
4047 {
4048     static Mutex* volatile instance = NULL;
4049     if (instance == NULL)
4050     {
4051         cv::AutoLock lock(getInitializationMutex());
4052         if (instance == NULL)
4053             instance = new Mutex();
4054     }
4055     return *instance;
4056 }
4057
4058 typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
4059
4060 static LayerFactory_Impl& getLayerFactoryImpl_()
4061 {
4062     static LayerFactory_Impl impl;
4063     return impl;
4064 }
4065
4066 static LayerFactory_Impl& getLayerFactoryImpl()
4067 {
4068     static LayerFactory_Impl* volatile instance = NULL;
4069     if (instance == NULL)
4070     {
4071         cv::AutoLock lock(getLayerFactoryMutex());
4072         if (instance == NULL)
4073         {
4074             instance = &getLayerFactoryImpl_();
4075             initializeLayerFactory();
4076         }
4077     }
4078     return *instance;
4079 }
4080
4081 void LayerFactory::registerLayer(const String &type, Constructor constructor)
4082 {
4083     CV_TRACE_FUNCTION();
4084     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4085
4086     cv::AutoLock lock(getLayerFactoryMutex());
4087     String type_ = toLowerCase(type);
4088     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
4089
4090     if (it != getLayerFactoryImpl().end())
4091     {
4092         if (it->second.back() == constructor)
4093             CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
4094         it->second.push_back(constructor);
4095     }
4096     getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));
4097 }
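
// Registering a custom layer type. "Interp" and InterpLayer are hypothetical names;
// the CV_DNN_REGISTER_LAYER_CLASS helper macro offers roughly the same functionality.
#if 0   // usage sketch only; not compiled as part of the library
#include <opencv2/dnn.hpp>

class InterpLayer CV_FINAL : public cv::dnn::Layer
{
public:
    InterpLayer(const cv::dnn::LayerParams& params) : Layer(params) {}

    static cv::Ptr<cv::dnn::Layer> create(cv::dnn::LayerParams& params)
    {
        return cv::Ptr<cv::dnn::Layer>(new InterpLayer(params));
    }

    void forward(cv::InputArrayOfArrays inputs, cv::OutputArrayOfArrays outputs,
                 cv::OutputArrayOfArrays internals) CV_OVERRIDE
    {
        // custom computation goes here
    }
};

void exampleRegisterLayer()
{
    cv::dnn::LayerFactory::registerLayer("Interp", InterpLayer::create);
}
#endif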
4098
4099 void LayerFactory::unregisterLayer(const String &type)
4100 {
4101     CV_TRACE_FUNCTION();
4102     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4103
4104     cv::AutoLock lock(getLayerFactoryMutex());
4105     String type_ = toLowerCase(type);
4106
4107     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
4108     if (it != getLayerFactoryImpl().end())
4109     {
4110         if (it->second.size() > 1)
4111             it->second.pop_back();
4112         else
4113             getLayerFactoryImpl().erase(it);
4114     }
4115 }
4116
4117 Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
4118 {
4119     CV_TRACE_FUNCTION();
4120     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4121
4122     cv::AutoLock lock(getLayerFactoryMutex());
4123     String type_ = toLowerCase(type);
4124     LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);
4125
4126     if (it != getLayerFactoryImpl().end())
4127     {
4128         CV_Assert(!it->second.empty());
4129         return it->second.back()(params);
4130     }
4131     else
4132     {
4133         return Ptr<Layer>(); //NULL
4134     }
4135 }
4136
4137 BackendNode::BackendNode(int backendId) : backendId(backendId) {}
4138
4139 BackendNode::~BackendNode() {}
4140
4141 BackendWrapper::BackendWrapper(int backendId, int targetId)
4142     : backendId(backendId), targetId(targetId) {}
4143
4144 BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
4145 {
4146     CV_Error(Error::StsNotImplemented,
4147              "Constructor of backend wrapper must be implemented");
4148 }
4149
4150 BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
4151 {
4152     CV_Error(Error::StsNotImplemented,
4153              "Constructor of backend wrapper must be implemented");
4154 }
4155
4156 BackendWrapper::~BackendWrapper() {}

Net readNet(const String& _model, const String& _config, const String& _framework)
{
    String framework = toLowerCase(_framework);
    String model = _model;
    String config = _config;
    const std::string modelExt = model.substr(model.rfind('.') + 1);
    const std::string configExt = config.substr(config.rfind('.') + 1);
    if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
                                modelExt == "prototxt" || configExt == "prototxt")
    {
        if (modelExt == "prototxt" || configExt == "caffemodel")
            std::swap(model, config);
        return readNetFromCaffe(config, model);
    }
    if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
                                     modelExt == "pbtxt" || configExt == "pbtxt")
    {
        if (modelExt == "pbtxt" || configExt == "pb")
            std::swap(model, config);
        return readNetFromTensorflow(model, config);
    }
    if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
                                configExt == "t7" || configExt == "net")
    {
        return readNetFromTorch(model.empty() ? config : model);
    }
    if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
                                  modelExt == "cfg" || configExt == "cfg")
    {
        if (modelExt == "cfg" || configExt == "weights")
            std::swap(model, config);
        return readNetFromDarknet(config, model);
    }
    if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
                               modelExt == "xml" || configExt == "xml")
    {
        if (modelExt == "xml" || configExt == "bin")
            std::swap(model, config);
        return readNetFromModelOptimizer(config, model);
    }
    if (framework == "onnx" || modelExt == "onnx")
    {
        return readNetFromONNX(model);
    }
    CV_Error(Error::StsError, "Cannot determine the origin framework of files: " +
                                      model + (config.empty() ? "" : ", " + config));
}
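// Illustrative usage (a sketch; the file names are hypothetical):
//
//     Net caffeNet = readNet("bvlc_googlenet.caffemodel", "deploy.prototxt");
//     Net tfNet    = readNet("frozen_inference_graph.pb");   // config is optional
//     Net onnxNet  = readNet("model.onnx");
//
// The model and config paths may be given in either order for Caffe, TensorFlow,
// Darknet and Model Optimizer files: the extensions decide which is which, and the
// arguments are swapped if needed.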

Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
            const std::vector<uchar>& bufferConfig)
{
    String framework = toLowerCase(_framework);
    if (framework == "caffe")
        return readNetFromCaffe(bufferConfig, bufferModel);
    else if (framework == "tensorflow")
        return readNetFromTensorflow(bufferModel, bufferConfig);
    else if (framework == "darknet")
        return readNetFromDarknet(bufferConfig, bufferModel);
    else if (framework == "torch")
        CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
    else if (framework == "dldt")
        CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
    CV_Error(Error::StsError, "Cannot determine the origin framework from the name " + framework);
}
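// Illustrative usage (a sketch assuming the raw model bytes are already available;
// the file name is hypothetical):
//
//     std::ifstream f("frozen_inference_graph.pb", std::ios::binary);
//     std::vector<uchar> modelData((std::istreambuf_iterator<char>(f)),
//                                   std::istreambuf_iterator<char>());
//     std::vector<uchar> configData;  // empty: no separate .pbtxt config
//     Net net = readNet("tensorflow", modelData, configData);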

// Reads a network serialized by Intel's Model Optimizer (OpenVINO): "xml" is the
// network topology description and "bin" contains the trained weights.
Net readNetFromModelOptimizer(const String &xml, const String &bin)
{
    return Net::readFromModelOptimizer(xml, bin);
}

CV__DNN_INLINE_NS_END
}} // namespace