Fix a bug with OpenVINO backend
[platform/upstream/opencv.git] modules/dnn/src/dnn.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "op_halide.hpp"
#include "op_inf_engine.hpp"
#include "halide_scheduler.hpp"
#include <set>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <iterator>
#include <numeric>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>

#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>

namespace cv {
namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN

// This option is useful for running Valgrind memory error detection.
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);

#ifdef HAVE_OPENCL
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
#endif

static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef HAVE_INF_ENGINE
    (size_t)DNN_BACKEND_INFERENCE_ENGINE
#else
    (size_t)DNN_BACKEND_OPENCV
#endif
);

// Additional checks (slow down execution!)
static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
using std::make_pair;
using std::set;

namespace
{
    typedef std::vector<MatShape> ShapesVec;

    struct LayerShapes
    {
        ShapesVec in, out, internal;
        // There is no guarantee that a layer which supports in-place computation
        // will actually be computed in-place (input.data_ptr == output.data_ptr).
        // If a layer reports that it can work in-place and the layers after it
        // no longer use the input blob, we set output = input.
        bool supportInPlace;
        LayerShapes() {supportInPlace = false;}
    };
}

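// Usage sketch (illustrative, not part of the original file): preprocess one
// image for a hypothetical network that expects a 224x224 BGR input with an
// assumed mean of (104, 117, 123) subtracted:
//   Mat img = imread("image.jpg");
//   Mat blob = blobFromImage(img, 1.0, Size(224, 224), Scalar(104, 117, 123),
//                            /*swapRB=*/true, /*crop=*/false, CV_32F);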
Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
                  const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
                   const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> images(1, image.getMat());
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
}

Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

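// Preprocessing happens in this order: optional resize (with center crop when
// crop=true), conversion of 8-bit inputs to float, mean subtraction (with the
// mean's R and B components swapped when swapRB=true), multiplication by
// scalefactor, and finally packing into a 4D NCHW blob.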
void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
                    Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
    if (ddepth == CV_8U)
    {
        CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
        CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
    }

    std::vector<Mat> images;
    images_.getMatVector(images);
    CV_Assert(!images.empty());
    for (int i = 0; i < images.size(); i++)
    {
        Size imgSize = images[i].size();
        if (size == Size())
            size = imgSize;
        if (size != imgSize)
        {
            if(crop)
            {
              float resizeFactor = std::max(size.width / (float)imgSize.width,
                                            size.height / (float)imgSize.height);
              resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
              Rect crop(Point(0.5 * (images[i].cols - size.width),
                              0.5 * (images[i].rows - size.height)),
                        size);
              images[i] = images[i](crop);
            }
            else
              resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
        }
        if(images[i].depth() == CV_8U && ddepth == CV_32F)
            images[i].convertTo(images[i], CV_32F);
        Scalar mean = mean_;
        if (swapRB)
            std::swap(mean[0], mean[2]);

        images[i] -= mean;
        images[i] *= scalefactor;
    }

    size_t i, nimages = images.size();
    Mat image0 = images[0];
    int nch = image0.channels();
    CV_Assert(image0.dims == 2);
    Mat image;
    if (nch == 3 || nch == 4)
    {
        int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
        blob_.create(4, sz, ddepth);
        Mat blob = blob_.getMat();
        Mat ch[4];

        for( i = 0; i < nimages; i++ )
        {
            image = images[i];
            CV_Assert(image.depth() == blob_.depth());
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
            CV_Assert(image.size() == image0.size());

            for( int j = 0; j < nch; j++ )
                ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
            if(swapRB)
                std::swap(ch[0], ch[2]);
            split(image, ch);
        }
    }
    else
    {
       CV_Assert(nch == 1);
       int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
       blob_.create(4, sz, ddepth);
       Mat blob = blob_.getMat();

       for( i = 0; i < nimages; i++ )
       {
           Mat image = images[i];
           CV_Assert(image.depth() == blob_.depth());
           nch = image.channels();
           CV_Assert(image.dims == 2 && (nch == 1));
           CV_Assert(image.size() == image0.size());

           image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
       }
    }
}

void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
{
    CV_TRACE_FUNCTION();

    // A blob is a 4-dimensional matrix in floating point precision:
    // blob_[0] = batchSize = number of images
    // blob_[1] = number of channels
    // blob_[2] = height
    // blob_[3] = width
    CV_Assert(blob_.depth() == CV_32F);
    CV_Assert(blob_.dims == 4);

    images_.create(cv::Size(1, blob_.size[0]), blob_.depth());

    std::vector<Mat> vectorOfChannels(blob_.size[1]);
    for (int n = 0; n <  blob_.size[0]; ++n)
    {
        for (int c = 0; c < blob_.size[1]; ++c)
        {
            vectorOfChannels[c] = getPlane(blob_, n, c);
        }
        cv::merge(vectorOfChannels, images_.getMatRef(n));
    }
}

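// Wraps a host cv::Mat with a device-side cv::UMat for the OpenCL target.
// Host-to-device copies are lazy: setHostDirty() only marks the host data as
// modified, and copyToDevice() performs the actual upload on demand.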
class OpenCLBackendWrapper : public BackendWrapper
{
public:
    OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        m.copyTo(umat);
        host = &m;
        hostDirty = false;
    }

    OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
        : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
        CV_Assert(!base.empty());

        host = &m;

        int shape[] = {1, (int)base->umat.total()};
        umat = base->umat.reshape(1, 2, &shape[0])
                         .colRange(0, host->total())
                         .reshape(1, host->dims, &host->size[0]);
        hostDirty = false;
    }

    static Ptr<BackendWrapper> create(Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
    }

    static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
    }

    static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
    {
        const int numWrappers = wrappers.size();
        std::vector<UMat> mats(wrappers.size());
        for (int i = 0; i < numWrappers; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->copyToDevice();
            mats[i] = umatWrapper->umat;
        }
        return mats;
    }

    // Replaces all UMats in the wrappers with the given ones.
    static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
                       const std::vector<UMat>& umats)
    {
        CV_Assert(wrappers.size() == umats.size());
        for (int i = 0, n = umats.size(); i < n; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->umat = umats[i];
        }
    }

    ~OpenCLBackendWrapper() {}

    // Copies data from the device to host memory.
    virtual void copyToHost() CV_OVERRIDE
    {
        umat.copyTo(*host);
    }

    virtual void setHostDirty() CV_OVERRIDE
    {
        hostDirty = true;
    }

    void copyToDevice()
    {
        if (hostDirty)
        {
            host->copyTo(umat);
            hostDirty = false;
        }
    }

private:
    UMat umat;
    Mat* host;
    bool hostDirty;
};

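// Identifies a single layer output: lid is the layer id, oid is the index of
// the output blob within that layer.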
struct LayerPin
{
    int lid;
    int oid;

    LayerPin(int layerId = -1, int outputId = -1)
        : lid(layerId), oid(outputId) {}

    bool valid() const
    {
        return (lid >= 0 && oid >= 0);
    }

    bool equal(const LayerPin &r) const
    {
        return (lid == r.lid && oid == r.oid);
    }

    bool operator<(const LayerPin &r) const
    {
        return lid < r.lid || (lid == r.lid && oid < r.oid);
    }

    bool operator ==(const LayerPin &r) const
    {
        return lid == r.lid && oid == r.oid;
    }
};

struct LayerData
{
    LayerData() : id(-1), skip(false), flag(0) {}
    LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
        : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
    {
        CV_TRACE_FUNCTION();

        // Add logging info.
        params.name = name;
        params.type = type;
    }

    int id;
    String name;
    String type;
    LayerParams params;

    std::vector<LayerPin> inputBlobsId;
    std::set<int> inputLayersId;
    std::set<int> requiredOutputs;
    std::vector<LayerPin> consumers;
    std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;

    Ptr<Layer> layerInstance;
    std::vector<Mat> outputBlobs;
    std::vector<Mat*> inputBlobs;
    std::vector<Mat> internals;
    // Computation nodes of implemented backends (except DEFAULT).
    std::map<int, Ptr<BackendNode> > backendNodes;
    // Flag to skip this layer's computation for a specific backend.
    bool skip;

    int flag;

    Ptr<Layer> getLayerInstance()
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(type, "type", type.c_str());

        if (layerInstance)
            return layerInstance;

        layerInstance = LayerFactory::createLayerInstance(type, params);
        if (!layerInstance)
        {
            CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
        }

        return layerInstance;
    }
};

// Fake layer containing the network input blobs.
struct DataLayer : public Layer
{
    DataLayer() : Layer()
    {
        skip = false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (outputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> outputs, internals;
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |      uint8 |        fp32 |
        for (int i = 0; i < inputsData.size(); ++i)
        {
            double scale = scaleFactors[i];
            Scalar& mean = means[i];
            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (singleMean)
            {
                inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
            }
            else
            {
                for (int n = 0; n < inputsData[i].size[0]; ++n)
                    for (int c = 0; c < inputsData[i].size[1]; ++c)
                    {
                        Mat inp = getPlane(inputsData[i], n, c);
                        Mat out = getPlane(outputs[i], n, c);
                        inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                    }
            }
        }
    }

#ifdef HAVE_OPENCL
    std::vector<Mat> tmp_expressions;
    bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |       fp32 |        fp16 |
        // |      uint8 |        fp32 |
        std::vector<UMat> outputs;
        outputs_.getUMatVector(outputs);

        tmp_expressions.clear();
        for (int i = 0; i < inputsData.size(); ++i)
        {
            Mat inputData = inputsData[i];

            double scale = scaleFactors[i];
            Scalar& mean = means[i];

            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (outputs_.depth() == CV_16S)
            {
                if (singleMean)
                {
                    tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
                    convertFp16(tmp_expressions.back(), outputs[i]);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            tmp_expressions.push_back(scale * (inp - mean[c]));
                            convertFp16(tmp_expressions.back(), out);
                        }
                }
            }
            else
            {
                CV_Assert(outputs_.depth() == CV_32F);
                if (singleMean)
                {
                    inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                        }
                }
            }
        }
        return true;
    }
#endif

    int outputNameToIndex(const String& tgtName) CV_OVERRIDE
    {
        int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
        return (idx < (int)outNames.size()) ? idx : -1;
    }

    void setNames(const std::vector<String> &names)
    {
        outNames.assign(names.begin(), names.end());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == requiredOutputs);
        outputs.assign(inputs.begin(), inputs.end());
        return false;
    }

    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);

        CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
                  inputsData.size() == outputs.size());
        skip = true;
        for (int i = 0; skip && i < inputsData.size(); ++i)
        {
            if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
                skip = false;
        }
    }

    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "ScaleShift";
        lp.precision = InferenceEngine::Precision::FP32;
        std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));

        CV_CheckEQ(inputsData.size(), (size_t)1, "");
        CV_CheckEQ(inputsData[0].dims, 4, "");
        const size_t numChannels = inputsData[0].size[1];
        CV_Assert(numChannels <= 4);

        // Scale
        auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
                                                                {numChannels});
        weights->allocate();
        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
        ieLayer->_weights = weights;

        // Mean subtraction
        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
                                                               {numChannels});
        biases->allocate();
        std::vector<float> biasesVec(numChannels);
        for (int i = 0; i < numChannels; ++i)
        {
            biasesVec[i] = -means[0][i] * scaleFactors[0];
        }
        biases->set(biasesVec);
        ieLayer->_biases = biases;

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }

    std::vector<String> outNames;
    // Preprocessing parameters for each network input.
    std::vector<double> scaleFactors;
    std::vector<Scalar> means;
    std::vector<Mat> inputsData;
    bool skip;
};

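// Tracks ownership and reuse of layer output memory via reference counting.
// Blobs whose reference count has dropped to zero may be handed out again as
// storage for later layers, which reduces the total allocation size (this can
// be disabled with OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS, e.g. for Valgrind).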
struct BlobManager
{
public:
    // Increase the reference counter of a layer output.
    void addReference(const LayerPin& lp)
    {
        std::map<LayerPin, int>::iterator it = refCounter.find(lp);
        if (it == refCounter.end())
            refCounter[lp] = 1;
        else
            it->second += 1;
    }

    void addReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            addReference(pins[i]);
        }
    }

    // Returns the number of references to the allocated memory that is used
    // by the specific layer blob.
    int numReferences(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());
        LayerPin memHost = mapIt->second;

        std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
        CV_Assert(refIt != refCounter.end());
        return refIt->second;
    }

    // Reuse the data allocated for <host> inside the <user> blob.
    void reuse(const LayerPin& host, const LayerPin& user)
    {
        CV_Assert(reuseMap.find(user) == reuseMap.end());
        CV_Assert(reuseMap.find(host) != reuseMap.end());
        LayerPin memHost = reuseMap[host];
        reuseMap[user] = memHost;
        if (refCounter.find(memHost) != refCounter.end())
        {
            std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
            if (userRefIt != refCounter.end())
            {
                refCounter[memHost] += userRefIt->second;
                refCounter.erase(userRefIt);
            }
            else
                refCounter[memHost] += 1;
        }
    }

    // Decrease the reference counter of the memory allocated for the specific blob.
    void releaseReference(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());

        std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
        CV_Assert(refIt != refCounter.end());
        CV_Assert(refIt->second > 0);
        refIt->second -= 1;
    }

    void releaseReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            releaseReference(pins[i]);
        }
    }

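    // Best-fit reuse: among blobs with zero remaining references, pick the
    // smallest one that can still hold total(shape) elements; only if none
    // fits is new memory allocated.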
    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
    {
        if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
        {
            Mat bestBlob;
            LayerPin bestBlobPin;

            std::map<LayerPin, Mat>::iterator hostIt;
            std::map<LayerPin, int>::iterator refIt;

            const int targetTotal = total(shape);
            int bestBlobTotal = INT_MAX;

            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
            {
                refIt = refCounter.find(hostIt->first);
                // Use only blobs that had references before; otherwise the
                // blob might be used as a network output.
                if (refIt != refCounter.end() && refIt->second == 0)
                {
                    Mat& unusedBlob = hostIt->second;
                    if (unusedBlob.total() >= targetTotal &&
                        unusedBlob.total() < bestBlobTotal)
                    {
                        bestBlobPin = hostIt->first;
                        bestBlob = unusedBlob;
                        bestBlobTotal = unusedBlob.total();
                    }
                }
            }
            if (!bestBlob.empty())
            {
                reuse(bestBlobPin, lp);
                dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
                return;
            }
        }

        {
            // If dst has already been allocated with total(shape) elements,
            // it won't be recreated and the dst.data pointer remains the same.
            dst.create(shape, use_half ? CV_16S : CV_32F);
            addHost(lp, dst);
        }
    }

    void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
                               std::vector<LayerPin>& pinsForInternalBlobs,
                               bool use_half = false)
    {
        CV_TRACE_FUNCTION();

        pinsForInternalBlobs.clear();

        std::vector<Mat>& outputBlobs = ld.outputBlobs,
                &internalBlobs = ld.internals;

        const ShapesVec& outShapes = layerShapes.out,
                internalShapes = layerShapes.internal;

        outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
        internalBlobs.resize(internalShapes.size());

        CV_Assert(ld.requiredOutputs.size() <= outShapes.size());

        // Check whether the layer can work in-place.
        bool inPlace = false;
        if (layerShapes.supportInPlace)
        {
            if (ld.inputBlobs.size() == 1)
            {
                // Get the number of references to the input memory.
                int numRef = numReferences(ld.inputBlobsId[0]);
                // In-place is possible only if the current layer is the one
                // and only consumer of this blob.
                inPlace = numRef == 1;
            }
        }

        ShapesVec shapes(outShapes);
        shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
        std::vector<Mat*> blobs;
        for(int i = 0; i < outputBlobs.size(); i++)
        {
            blobs.push_back(&outputBlobs[i]);
        }

        for(int i = 0; i < internalBlobs.size(); i++)
        {
            blobs.push_back(&internalBlobs[i]);
            if (total(internalShapes[i]))
            {
                pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
            }
        }

        addReferences(pinsForInternalBlobs);

        std::map<int, std::vector<int> > idxSizes;
        for(int i = 0; i < shapes.size(); i++)
        {
            idxSizes[total(shapes[i])].push_back(i);
        }

        std::map<int, std::vector<int> >::reverse_iterator it;
        for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
        {
            for(int j = 0; j < it->second.size(); j++)
            {
                int index = it->second[j];
                if (total(shapes[index]))
                {
                    LayerPin blobPin(ld.id, index);
                    if (index < outShapes.size() && inPlace)
                    {
                        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
                        reuse(ld.inputBlobsId[0], blobPin);
                    }
                    else
                        reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
                }
            }
        }
    }

    // Clear the internal state. Called before every reallocation.
    void reset()
    {
        CV_TRACE_FUNCTION();

        refCounter.clear();
        reuseMap.clear();
        memHosts.clear();
    }

private:
    // Register allocated memory.
    void addHost(const LayerPin& lp, const Mat& mat)
    {
        CV_Assert(memHosts.find(lp) == memHosts.end());
        reuseMap[lp] = lp;
        memHosts[lp] = mat;
    }

    std::map<LayerPin, int> refCounter;
    // Maps a pin to its origin blob (the one for which the memory was
    // originally allocated). For origin blobs, key == value.
    std::map<LayerPin, LayerPin> reuseMap;
    std::map<LayerPin, Mat> memHosts;
};

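// Creates a backend-specific wrapper around a host Mat; returns an empty
// pointer for the plain OpenCV/CPU path, where no wrapping is needed.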
static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
    if (backendId == DNN_BACKEND_OPENCV)
    {
        if (targetId == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();
        else if (IS_DNN_OPENCL_TARGET(targetId))
            return OpenCLBackendWrapper::create(m);
        else
            CV_Error(Error::StsNotImplemented, "Unknown target identifier");
    }
    else if (backendId == DNN_BACKEND_HALIDE)
    {
        CV_Assert(haveHalide());
#ifdef HAVE_HALIDE
        return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
#endif  // HAVE_HALIDE
    }
    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    {
        CV_Assert(haveInfEngine());
#ifdef HAVE_INF_ENGINE
        return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
#endif  // HAVE_INF_ENGINE
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    return Ptr<BackendWrapper>();
}

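// Internal implementation of cv::dnn::Net: owns the layer graph, the blob
// memory manager, and the per-backend initialization logic.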
struct Net::Impl
{
    typedef std::map<int, LayerShapes> LayersShapesMap;
    typedef std::map<int, LayerData> MapIdToLayerData;

    Impl()
    {
        // Allocate the fake network input layer.
        netInputLayer = Ptr<DataLayer>(new DataLayer());
        LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
        inpl.id = 0;
        netInputLayer->name = inpl.name = "_input";
        inpl.type = "__NetInputLayer__";
        inpl.layerInstance = netInputLayer;
        layerNameToId.insert(std::make_pair(inpl.name, inpl.id));

        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
    }

    Ptr<DataLayer> netInputLayer;
    std::vector<LayerPin> blobsToKeep;
    MapIdToLayerData layers;
    std::map<String, int> layerNameToId;
    BlobManager blobManager;
    int preferableBackend;
    int preferableTarget;
    String halideConfigFile;
    bool skipInfEngineInit;
    // Maps host data to a backend-specific wrapper.
    std::map<void*, Ptr<BackendWrapper> > backendWrappers;

    int lastLayerId;

    bool netWasAllocated;
    bool fusion;
    std::vector<int64> layersTimings;
    Mat output_blob;

    Ptr<BackendWrapper> wrap(Mat& host)
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();

        MatShape shape(host.dims);
        for (int i = 0; i < host.dims; ++i)
            shape[i] = host.size[i];

        void* data = host.data;
        if (backendWrappers.find(data) != backendWrappers.end())
        {
            Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
            if (preferableBackend == DNN_BACKEND_OPENCV)
            {
                CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
                return OpenCLBackendWrapper::create(baseBuffer, host);
            }
            else if (preferableBackend == DNN_BACKEND_HALIDE)
            {
                CV_Assert(haveHalide());
  #ifdef HAVE_HALIDE
                return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
  #endif  // HAVE_HALIDE
            }
            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            {
                return wrapMat(preferableBackend, preferableTarget, host);
            }
            else
                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
        }

        Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
        backendWrappers[data] = wrapper;
        return wrapper;
    }

#ifdef HAVE_HALIDE
    void compileHalide()
    {
        CV_TRACE_FUNCTION();

        CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);

        HalideScheduler scheduler(halideConfigFile);
        std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
        {
            LayerData &ld = it->second;
            Ptr<Layer> layer = ld.layerInstance;
            if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
            {
                CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
                bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
                if (!scheduled)
                {
                    // Use the automatic scheduling provided by the layer.
                    layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
                                                ld.inputBlobs, ld.outputBlobs,
                                                preferableTarget);
                }
                compileList.emplace_back(ld);
            }
        }
        std::atomic<int> progress(0);
        auto fn = ([&] () -> void
        {
            for (;;)
            {
                int id = progress.fetch_add(1);
                if ((size_t)id >= compileList.size())
                    return;
                const LayerData& ld = compileList[id].get();
                Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
                dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
            }
        });
        size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
        num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
        std::vector<std::thread> threads(num_threads - 1);
        for (auto& t: threads) t = std::thread(fn);
        fn(); // process own tasks
        for (auto& t: threads) t.join();
    }
#endif

    void clear()
    {
        CV_TRACE_FUNCTION();

        MapIdToLayerData::iterator it;
        for (it = layers.begin(); it != layers.end(); it++)
        {
            if (it->second.id != 0) {
                it->second.inputBlobs.clear();
                it->second.outputBlobs.clear();
                it->second.internals.clear();
            }
            it->second.skip = false;
            //it->second.consumers.clear();
            Ptr<Layer> currLayer = it->second.layerInstance;

            if( currLayer.empty() )
                continue;

            currLayer->unsetAttached();

            Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
            if( !poolingLayer.empty() )
            {
                poolingLayer->computeMaxIdx = true;
            }
        }

        layersTimings.clear();
    }

    void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
    {
        CV_TRACE_FUNCTION();

        if (preferableBackend == DNN_BACKEND_DEFAULT)
            preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;

        CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16);
        CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL);
        CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16 ||
                  preferableTarget == DNN_TARGET_MYRIAD);
        if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
        {
            if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
            {
                CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
                preferableTarget = DNN_TARGET_CPU;
            }
#else
            {
                if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
                {
                    // The current implementation is only valid for GPUs (#11494).
                    if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
                    {
                        CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
                        preferableTarget = DNN_TARGET_CPU;
                    }
                    else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
                    {
                        CV_LOG_WARNING(NULL,
                            "DNN: OpenCL target with fp16 precision is not supported "
                            "with current OpenCL device (tested with Intel GPUs only), "
                            "switching to OpenCL with fp32 precision.");
                        preferableTarget = DNN_TARGET_OPENCL;
                    }
                }
            }
#endif
            clear();

            allocateLayers(blobsToKeep_);

            MapIdToLayerData::iterator it = layers.find(0);
            CV_Assert(it != layers.end());
            it->second.skip = netInputLayer->skip;

            initBackend();

            if (!netWasAllocated )
            {
#ifdef HAVE_HALIDE
                if (preferableBackend == DNN_BACKEND_HALIDE)
                    compileHalide();
#else
                CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
#endif
            }

            netWasAllocated = true;
            this->blobsToKeep = blobsToKeep_;
        }
    }

    int getLayerId(const String &layerName)
    {
        std::map<String, int>::iterator it = layerNameToId.find(layerName);
        return (it != layerNameToId.end()) ? it->second : -1;
    }

    int getLayerId(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? id : -1;
    }

    int getLayerId(DictValue &layerDesc)
    {
        if (layerDesc.isInt())
            return getLayerId(layerDesc.get<int>());
        else if (layerDesc.isString())
            return getLayerId(layerDesc.get<String>());

        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        return -1;
    }

    String getLayerName(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? it->second.name : "(unknown layer)";
    }

    LayerData& getLayerData(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);

        if (it == layers.end())
            CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));

        return it->second;
    }

    LayerData& getLayerData(const String &layerName)
    {
        int id = getLayerId(layerName);

        if (id < 0)
            CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");

        return getLayerData(id);
    }

    LayerData& getLayerData(const DictValue &layerDesc)
    {
        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        if (layerDesc.isInt())
            return getLayerData(layerDesc.get<int>());
        else /*if (layerDesc.isString())*/
            return getLayerData(layerDesc.get<String>());
    }

    static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
    {
        if ((int)ld.inputBlobsId.size() <= inNum)
        {
            ld.inputBlobsId.resize(inNum + 1);
        }
        else
        {
            LayerPin storedFrom = ld.inputBlobsId[inNum];
            if (storedFrom.valid() && !storedFrom.equal(from))
                CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
                                                 inNum, ld.name.c_str()));
        }

        ld.inputBlobsId[inNum] = from;
    }

    int resolvePinOutputName(LayerData &ld, const String &outName)
    {
        if (outName.empty())
            return 0;
        return ld.getLayerInstance()->outputNameToIndex(outName);
    }

    LayerPin getPinByAlias(const String &layerName)
    {
        LayerPin pin;
        pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        if (pin.lid >= 0)
            pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);

        return pin;
    }

    std::vector<LayerPin> getLayerOutPins(const String &layerName)
    {
        int lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        std::vector<LayerPin> pins;

        for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
        {
            pins.push_back(LayerPin(lid, i));
        }

        return pins;
    }

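    // Connects output #outNum of layer <outLayerId> to input #inNum of layer
    // <inLayerId>. Layer ids must be topologically ordered (producer < consumer).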
    void connect(int outLayerId, int outNum, int inLayerId, int inNum)
    {
        CV_Assert(outLayerId < inLayerId);
        LayerData &ldOut = getLayerData(outLayerId);
        LayerData &ldInp = getLayerData(inLayerId);

        addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
        ldOut.requiredOutputs.insert(outNum);
        ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
    }

    void initBackend()
    {
        CV_TRACE_FUNCTION();
        if (preferableBackend == DNN_BACKEND_OPENCV)
            CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
        else if (preferableBackend == DNN_BACKEND_HALIDE)
            initHalideBackend();
        else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            initInfEngineBackend();
        else
            CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    }

    void initHalideBackend()
    {
        CV_TRACE_FUNCTION();
        CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());

        // Iterator to the current layer.
        MapIdToLayerData::iterator it = layers.begin();
        // Iterator to the base layer for fusion. For example, in the case of
        // conv+bn+relu it will be the conv layer.
        MapIdToLayerData::iterator baseIt = layers.begin();
        for (; it != layers.end(); it++)
        {
            LayerData &ldTop = it->second;
            Ptr<Layer> layerTop = ldTop.layerInstance;
            if (!layerTop->supportBackend(preferableBackend))
            {
                // Move the base iterator to the layer that doesn't support the
                // preferable backend to prevent fusion across layers of
                // different backends.
                baseIt = it;
                continue;
            }
            // Try to fuse layers.
            LayerData &ldBot = baseIt->second;
            Ptr<Layer> layerBot = ldBot.layerInstance;
            // 1. Check that the bottom and top layers are from the same backend.
            if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
            {
                // 2. Check that the current layer works in-place.
                bool inPlace = ldTop.inputBlobs.size() == 1 &&
                               ldBot.outputBlobs.size() == 1 &&
                               ldTop.inputBlobs[0]->data ==
                               ldBot.outputBlobs[0].data;
                if (inPlace)
                {
                    // 3. Try to attach the node.
                    CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
                    Ptr<BackendNode> fusedNode =
                        layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
                    if (!fusedNode.empty())
                    {
                        ldTop.skip = true;
                        ldBot.backendNodes[preferableBackend] = fusedNode;
                        ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
                        continue;
                    }
                }
            }
            // No layer fusion.
            ldTop.skip = false;
            ldTop.backendNodes[DNN_BACKEND_HALIDE] =
                layerTop->initHalide(ldTop.inputBlobsWrappers);
            baseIt = it;
        }
    }

#ifdef HAVE_INF_ENGINE
    // Before launching an Inference Engine graph we need to specify its output
    // blobs. This function requests output blobs based on input references of
    // layers from the default backend or from different graphs.
    void addInfEngineNetOutputs(LayerData &ld)
    {
        Ptr<InfEngineBackendNet> layerNet;
        if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
        {
            Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
            if (!node.empty())
            {
                Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
                CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
                layerNet = ieNode->net;
            }
        }
        // For every input reference we check whether it belongs to one of the
        // Inference Engine backend graphs; if so, request an output blob.
        // Do nothing if the layer's input is from the same graph.
        for (int i = 0; i < ld.inputBlobsId.size(); ++i)
        {
            LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
            Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
            if (!inpNode.empty())
            {
                Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
                CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
                if (layerNet != ieInpNode->net)
                {
                    // layerNet is empty or the nodes are from different graphs.
                    ieInpNode->net->addOutput(ieInpNode->layer->name);
                }
            }
        }
    }
#endif  // HAVE_INF_ENGINE

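    // Builds the Inference Engine (OpenVINO) representation of the network:
    // assigns names to the data nodes, then either registers blobs for an
    // externally loaded IR graph (skipInfEngineInit) or assembles backend
    // networks layer by layer, splitting the model at unsupported layers.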
1359     void initInfEngineBackend()
1360     {
1361         CV_TRACE_FUNCTION();
1362         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1363 #ifdef HAVE_INF_ENGINE
1364         MapIdToLayerData::iterator it;
1365         Ptr<InfEngineBackendNet> net;
1366
1367         for (it = layers.begin(); it != layers.end(); ++it)
1368         {
1369             LayerData &ld = it->second;
1370             if (ld.id == 0)
1371             {
1372                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1373                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1374                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1375                 {
1376                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1377                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1378                 }
1379             }
1380             else
1381             {
1382                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1383                 {
1384                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1385                     dataPtr->name = ld.name;
1386                 }
1387             }
1388         }
1389
1390         if (skipInfEngineInit)
1391         {
1392             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1393             CV_Assert(!node.empty());
1394
1395             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1396             CV_Assert(!ieNode.empty());
1397
1398             for (it = layers.begin(); it != layers.end(); ++it)
1399             {
1400                 LayerData &ld = it->second;
1401                 if (ld.id == 0)
1402                 {
1403                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1404                     {
1405                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1406                         dataPtr->name = netInputLayer->outNames[i];
1407                     }
1408                 }
1409                 else
1410                 {
1411                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1412                     {
1413                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1414                         dataPtr->name = ld.name;
1415                     }
1416                 }
1417                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1418                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1419                 ld.skip = true;
1420             }
1421             layers[lastLayerId].skip = false;
1422             ieNode->net->init(preferableTarget);
1423             return;
1424         }
1425
1426         // Build Inference Engine networks from sets of layers that support this
1427         // backend. Split a whole model on several Inference Engine networks if
1428         // some of layers is not implemented.
1429
1430         // Set of all input and output blobs wrappers for current network.
1431         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1432         for (it = layers.begin(); it != layers.end(); ++it)
1433         {
1434             LayerData &ld = it->second;
1435             if (ld.id == 0 && ld.skip)
1436                 continue;
1437             bool fused = ld.skip;
1438
1439             Ptr<Layer> layer = ld.layerInstance;
1440             if (!fused && !layer->supportBackend(preferableBackend))
1441             {
1442                 addInfEngineNetOutputs(ld);
1443                 net = Ptr<InfEngineBackendNet>();
1444                 netBlobsWrappers.clear();
1445                 layer->preferableTarget = DNN_TARGET_CPU;
1446                 continue;
1447             }
1448             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1449
1450             // Create a new network if one of inputs from different Inference Engine graph.
1451             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1452             {
1453                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1454                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1455                 if (!inpNode.empty())
1456                 {
1457                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1458                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1459                     if (ieInpNode->net != net)
1460                     {
1461                         net = Ptr<InfEngineBackendNet>();
1462                         netBlobsWrappers.clear();
1463                         break;
1464                     }
1465                 }
1466             }
1467
1468             // The same blob wrappers cannot be shared between two Inference Engine
1469             // networks because of explicit references between layers and blobs.
1470             // So we need to rewrap all the external blobs.
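            // (E.g., hypothetically: if layer A's output already feeds network #1
            // and a new network #2 starts here, network #2 gets a fresh
            // InfEngineBackendWrapper copy of that blob instead of the wrapper
            // registered with network #1.)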
1471             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1472             {
1473                 LayerPin inPin = ld.inputBlobsId[i];
1474                 auto it = netBlobsWrappers.find(inPin);
1475                 if (it == netBlobsWrappers.end())
1476                 {
1477                     ld.inputBlobsWrappers[i] = InfEngineBackendWrapper::create(ld.inputBlobsWrappers[i]);
1478                     netBlobsWrappers[inPin] = ld.inputBlobsWrappers[i];
1479                 }
1480                 else
1481                     ld.inputBlobsWrappers[i] = it->second;
1482             }
1483             netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0];
1484
1485             Ptr<BackendNode> node;
1486             if (!net.empty())
1487             {
1488                 if (fused)
1489                 {
1490                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1491                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1492                     CV_Assert(inPlace);
1493                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1494                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1495                 }
1496             }
1497             else
1498                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1499
1500             if (!fused)
1501             {
1502                 node = layer->initInfEngine(ld.inputBlobsWrappers);
1503             }
1504             else if (node.empty())
1505                 continue;
1506
1507             CV_Assert(!node.empty());
1508             ld.backendNodes[preferableBackend] = node;
1509
1510             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1511             CV_Assert(!ieNode.empty());
1512             ieNode->net = net;
1513
1514             auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
1515             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD) && !fused)
1516             {
1517                 ieNode->layer->precision = InferenceEngine::Precision::FP16;
1518                 if (weightableLayer)
1519                 {
1520                     if (weightableLayer->_weights)
1521                         weightableLayer->_weights = convertFp16(weightableLayer->_weights);
1522                     if (weightableLayer->_biases)
1523                         weightableLayer->_biases = convertFp16(weightableLayer->_biases);
1524                 }
1525                 else
1526                 {
1527                     for (const auto& weights : {"weights", "biases"})
1528                     {
1529                         auto it = ieNode->layer->blobs.find(weights);
1530                         if (it != ieNode->layer->blobs.end())
1531                             it->second = convertFp16(it->second);
1532                     }
1533                 }
1534             }
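            // Mirror _weights/_biases into the generic blobs map so that code
            // reading the map (e.g. when the Inference Engine network is built)
            // picks up the possibly FP16-converted data.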
1535             if (weightableLayer)
1536             {
1537                 if (weightableLayer->_weights)
1538                     weightableLayer->blobs["weights"] = weightableLayer->_weights;
1539                 if (weightableLayer->_biases)
1540                     weightableLayer->blobs["biases"] = weightableLayer->_biases;
1541             }
1542             ieNode->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers);
1543             net->addBlobs(ld.inputBlobsWrappers);
1544             net->addBlobs(ld.outputBlobsWrappers);
1545
1546             if (!fused)
1547                 net->addLayer(ieNode->layer);
1548             addInfEngineNetOutputs(ld);
1549         }
1550
1551         // Initialize all networks.
1552         std::set<InfEngineBackendNet> initializedNets;
1553         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1554         {
1555             LayerData &ld = it->second;
1556             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1557                 continue;
1558
1559             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1560             if (node.empty())
1561                 continue;
1562
1563             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1564             if (ieNode.empty())
1565                 continue;
1566
1567             CV_Assert(!ieNode->net.empty());
1568
1569             if (!ieNode->net->isInitialized())
1570             {
1571                 ieNode->net->init(preferableTarget);
1572                 ld.skip = false;
1573             }
1574         }
1575 #endif  // HAVE_INF_ENGINE
1576     }
1577
1578     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1579     {
1580         CV_TRACE_FUNCTION();
1581
1582         LayerData &ld = layers[lid];
1583
1584         //already allocated
1585         if (ld.flag)
1586             return;
1587
1588         size_t ninputs = ld.inputBlobsId.size();
1589 #if 0
1590         printf("layer %s:", ld.name.c_str());
1591         for (size_t i = 0; i < ninputs; i++)
1592         {
1593             int inp_lid = ld.inputBlobsId[i].lid;
1594             LayerData &inp_ld = layers[inp_lid];
1595             int inp_outputs = (int)inp_ld.outputBlobs.size();
1596             std::cout << " " << inp_ld.name << "(" << inp_outputs;
1597
1598             for( int j = 0; j < inp_outputs; j++ )
1599             {
1600                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1601             }
1602             std::cout << ")";
1603         }
1604         printf("\n");
1605 #endif
1606
1607         //determine parent layers
1608         for (size_t i = 0; i < ninputs; i++)
1609             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1610
1611         //allocate parents
1612         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1613             allocateLayer(*i, layersShapes);
1614
1615         //bind inputs
1616         if (ld.id == 0)  // DataLayer
1617         {
1618             ninputs = netInputLayer->inputsData.size();
1619             ld.inputBlobsWrappers.resize(ninputs);
1620             for (size_t i = 0; i < ninputs; i++)
1621             {
1622                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1623             }
1624         }
1625         else
1626         {
1627             ld.inputBlobs.resize(ninputs);
1628             ld.inputBlobsWrappers.resize(ninputs);
1629             for (size_t i = 0; i < ninputs; i++)
1630             {
1631                 LayerPin from = ld.inputBlobsId[i];
1632                 CV_Assert(from.valid());
1633                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1634                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1635                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1636             }
1637         }
1638
1639         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1640
1641         CV_Assert(layerShapesIt != layersShapes.end());
1642
1643         std::vector<LayerPin> pinsForInternalBlobs;
1644         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1645                                           preferableBackend == DNN_BACKEND_OPENCV &&
1646                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
1647         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1648         for (int i = 0; i < ld.outputBlobs.size(); ++i)
1649         {
1650             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1651         }
1652         ld.internalBlobsWrappers.resize(ld.internals.size());
1653         for (int i = 0; i < ld.internals.size(); ++i)
1654         {
1655             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1656         }
1657
1658         Ptr<Layer> layerPtr = ld.getLayerInstance();
1659         {
1660             std::vector<Mat> inps(ld.inputBlobs.size());
1661             for (int i = 0; i < ld.inputBlobs.size(); ++i)
1662             {
1663                 inps[i] = *ld.inputBlobs[i];
1664             }
1665             layerPtr->finalize(inps, ld.outputBlobs);
1666             layerPtr->preferableTarget = preferableTarget;
1667 #if 0
1668             std::cout << "\toutputs:";
1669             size_t noutputs = ld.outputBlobs.size();
1670             for (size_t j = 0; j < noutputs; j++)
1671             {
1672                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
1673             }
1674             std::cout << "\n";
1675 #endif
1676         }
1677
1678         // After allocating the layer, we decrease the reference counters of its input blobs so the blob manager can reuse that memory.
1679         blobManager.releaseReferences(ld.inputBlobsId);
1680         blobManager.releaseReferences(pinsForInternalBlobs);
1681
1682         ld.flag = 1;
1683     }
1684
1685 #if 0
1686 #define printf_(args) printf args
1687 #else
1688 #define printf_(args)
1689 #endif
1690
1691     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
1692     {
1693         if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
1694                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
1695             return;
1696
1697         CV_TRACE_FUNCTION();
1698
1699         // Scan through all the layers. If there is a convolution layer followed by an activation layer,
1700         // we try to embed the activation into the convolution and disable separate execution of the activation.
1701         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
1702                                       blobsToKeep_.end());
1703         MapIdToLayerData::iterator it;
1704         for (it = layers.begin(); it != layers.end(); it++)
1705         {
1706             int lid = it->first;
1707             LayerData& ld = layers[lid];
1708             if( ld.skip )
1709             {
1710                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1711                 continue;
1712             }
1713             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1714
1715             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
1716             // with the current layer if they follow it. Normally, they are fused with the convolution layer,
1717             // but some of them (like activation) may be fused with fully-connected, elementwise (+) and
1718             // some other layers.
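            // A typical chain (illustrative): conv -> batch_norm -> scale -> relu
            // collapses into a single fused convolution, with the follower layers
            // marked as skipped.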
1719             Ptr<Layer>& currLayer = ld.layerInstance;
1720             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
1721             {
1722                 LayerData* nextData = &layers[ld.consumers[0].lid];
1723                 LayerPin lpNext(ld.consumers[0].lid, 0);
1724                 while (nextData)
1725                 {
1726                     Ptr<Layer> nextLayer = nextData->layerInstance;
1727                     if (currLayer->tryFuse(nextLayer))
1728                     {
1729                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
1730                         nextData->skip = true;
1731                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1732                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1733                         if (nextData->consumers.size() == 1)
1734                         {
1735                             int nextLayerId = nextData->consumers[0].lid;
1736                             nextData = &layers[nextLayerId];
1737                             lpNext = LayerPin(nextLayerId, 0);
1738                         }
1739                         else
1740                         {
1741                             nextData = 0;
1742                             break;
1743                         }
1744                     }
1745                     else
1746                         break;
1747                 }
1748
1749                 if (preferableBackend != DNN_BACKEND_OPENCV)
1750                     continue;  // Go to the next layer.
1751
1752                 // TODO: the OpenCL target could support more fusion styles.
1753                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
1754                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
1755                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
1756                      ld.layerInstance->type != "Concat")) )
1757                     continue;
1758
1759                 while (nextData)
1760                 {
1761                     // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations.
1762                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
1763                         nextData->type != "ReLU" &&
1764                         nextData->type != "ChannelsPReLU" &&
1765                         nextData->type != "ReLU6" &&
1766                         nextData->type != "TanH" &&
1767                         nextData->type != "Power")
1768                         break;
1769
1770                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1771                     if (nextActivLayer.empty())
1772                         break;
1773
1774                     if (currLayer->setActivation(nextActivLayer))
1775                     {
1776                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1777                         nextData->skip = true;
1778                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1779                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1780                         if (nextData->consumers.size() == 1)
1781                         {
1782                             int nextLayerId = nextData->consumers[0].lid;
1783                             nextData = &layers[nextLayerId];
1784                             lpNext = LayerPin(nextLayerId, 0);
1785                         }
1786                         else
1787                         {
1788                             nextData = 0;
1789                             break;
1790                         }
1791                     }
1792                     else
1793                         break;
1794                 }
1795
1796                 // fuse convolution layer followed by eltwise + relu
1797                 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
1798                 {
1799                     Ptr<EltwiseLayer> nextEltwiseLayer;
1800                     if( nextData )
1801                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
1802
1803                     if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
1804                         nextData && nextData->inputBlobsId.size() == 2 )
1805                     {
1806                         LayerData *eltwiseData = nextData;
1807
1808                         // The Eltwise layer has two inputs. We need to determine which
1809                         // is the base convolution layer and which could be used as its bias.
1810                         LayerData* biasLayerData = 0;
1811                         for (int i = 0; i < 2; ++i)
1812                         {
1813                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
1814                             CV_Assert(downLayerData);
1815                             while (downLayerData->skip)
1816                             {
1817                                 if (downLayerData->inputBlobsId.size() == 1)
1818                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
1819                                 else
1820                                 {
1821                                     downLayerData = 0;
1822                                     break;
1823                                 }
1824                             }
1825                             if (downLayerData && ld.id == downLayerData->id)
1826                             {
1827                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
1828                                 break;
1829                             }
1830                         }
1831                         CV_Assert(biasLayerData);
1832                         {
1833                             if( eltwiseData->consumers.size() == 1 )
1834                             {
1835                                 // fuse eltwise + activation layer
1836                                 if (biasLayerData->id < ld.id)
1837                                 {
1838                                     nextData = &layers[eltwiseData->consumers[0].lid];
1839                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
1840                                     Ptr<ActivationLayer> nextActivLayer;
1841                                     if( nextData )
1842                                         nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1843
1844                                     if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
1845                                             (nextData->type == "ReLU" ||
1846                                              nextData->type == "ChannelsPReLU" ||
1847                                              nextData->type == "Power") &&
1848                                             currLayer->setActivation(nextActivLayer) )
1849                                     {
1850                                         CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
1851                                         ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
1852                                         printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
1853                                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1854                                         eltwiseData->skip = true;
1855                                         nextData->skip = true;
1856                                         // This optimization is for cases like:
1857                                         // some_layer   conv
1858                                         //   |             |
1859                                         //   +-- eltwise --+
1860                                         //          |
1861                                         //        activ
1862                                         // This way all the element-wise computations
1863                                         // (i.e. some_layer+conv or some_layer*conv)
1864                                         // would be done at the [conv] layer. So we need to
1865                                         // replace [conv]'s output blob with [eltwise]'s one,
1866                                         // considering that [activ] is an in-place layer.
1867                                         // Also we need to move all the consumers' references.
1868                                         // To prevent memory collisions (i.e. when input of
1869                                         // [conv] and output of [eltwise] is the same blob)
1870                                         // we allocate a new blob.
1871                                         CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
1872                                         ld.outputBlobs[0] = ld.outputBlobs[0].clone();
1873                                         ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
1874
1875                                         eltwiseData->outputBlobs = ld.outputBlobs;
1876                                         nextData->outputBlobs = ld.outputBlobs;
1877                                         eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
1878                                         nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
1879
1880                                         // Move references of [activ] layer consumers to the newly allocated blob.
1881                                         for (int i = 0; i < nextData->consumers.size(); ++i)
1882                                         {
1883                                             LayerData& consumer = layers[nextData->consumers[i].lid];
1884                                             for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
1885                                             {
1886                                                 if (consumer.inputBlobsId[j].lid == lpNext.lid)
1887                                                 {
1888                                                     consumer.inputBlobs[j] = &ld.outputBlobs[0];
1889                                                     consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
1890                                                     break;
1891                                                 }
1892                                             }
1893                                         }
1894                                     }
1895                                 }
1896                             }
1897                         }
1898                     }
1899                 }
1900             }
1901
1902             if (preferableBackend != DNN_BACKEND_OPENCV)
1903                 continue;  // Go to the next layer.
1904
1905             // Optimization #2: if there is no layer that takes the max pooling layer's computed
1906             // max indices (only some semantic segmentation networks might need them;
1907             // many others take only the maximum values), then we switch the max pooling
1908             // layer to the faster operating mode.
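            // E.g. (illustrative): a MaxPooling layer exposes the pooled values on
            // output pin 0 and the max indices on pin 1; if no consumer reads
            // pin 1, computeMaxIdx is turned off below.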
1909             Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
1910             if( !poolingLayer.empty() && !ld.consumers.empty() )
1911             {
1912                 size_t i = 0, nconsumers = ld.consumers.size();
1913                 for( ; i < nconsumers; i++ )
1914                     if( ld.consumers[i].oid > 0 )
1915                         break;
1916                 // If no layer takes the second output pin of the pooling layer
1917                 // as input, then we don't need to compute the indices.
1918                 if( i >= nconsumers )
1919                 {
1920                     poolingLayer->computeMaxIdx = false;
1921                     printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
1922                 }
1923             }
1924
1925             // Optimization #3: if there is a concat layer that concatenates channels
1926             // from the inputs (i.e. axis == 1), then we make the inputs of
1927             // the concat layer write directly into the concatenation output buffer
1928             // (and so we eliminate the concat layer, because the channels
1929             // are concatenated implicitly).
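            // Memory layout sketch (illustrative channel counts): two inputs of
            // 64 and 128 channels write straight into one 192-channel buffer:
            //     input0 -> output(:, 0..63)
            //     input1 -> output(:, 64..191)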
1930             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
1931             if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
1932                 ld.outputBlobs.size() == 1 )
1933             {
1934                 Mat& output = ld.outputBlobs[0];
1935                 UMat umat_output;
1936                 if (!ld.outputBlobsWrappers.empty() &&
1937                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
1938                 {
1939                     size_t i, ninputs = ld.inputBlobsId.size();
1940                     bool conv_layer = true;
1941                     for( i = 0; i < ninputs; i++ )
1942                     {
1943                         LayerPin pin = ld.inputBlobsId[i];
1944                         LayerData* inp_i_data = &layers[pin.lid];
1945                         while(inp_i_data->skip &&
1946                               inp_i_data->inputBlobsId.size() == 1 &&
1947                               inp_i_data->consumers.size() == 1)
1948                         {
1949                             pin = inp_i_data->inputBlobsId[0];
1950                             inp_i_data = &layers[pin.lid];
1951                         }
1952                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
1953                     }
1954                     if (!conv_layer)
1955                         continue;
1956                     std::vector<UMat> umat_outputBlobs;
1957                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
1958                     umat_output = umat_outputBlobs[0];
1959                 }
1960
1961                 // TODO: in general, this optimization can always be done, but
1962                 // many layers currently check that the input/output blobs are
1963                 // continuous arrays. Unfortunately, this is not true when
1964                 // the concatenation optimization is applied with batch_size > 1,
1965                 // so for now we only apply this optimization in the most popular
1966                 // case, batch_size == 1.
1967                 if( output.dims == 4 && output.size[0] == 1 )
1968                 {
1969                     size_t i, ninputs = ld.inputBlobsId.size();
1970                     std::vector<LayerPin> realinputs(ninputs);
1971                     for( i = 0; i < ninputs; i++ )
1972                     {
1973                         LayerPin pin = ld.inputBlobsId[i];
1974                         LayerData* inp_i_data = &layers[pin.lid];
1975                         while(inp_i_data->skip &&
1976                               inp_i_data->inputBlobsId.size() == 1 &&
1977                               inp_i_data->consumers.size() == 1)
1978                         {
1979                             pin = inp_i_data->inputBlobsId[0];
1980                             inp_i_data = &layers[pin.lid];
1981                         }
1982                         printf_(("\treal input for %s is %s\n",
1983                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
1984                                inp_i_data->getLayerInstance()->name.c_str()));
1985
1986                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
1987                             break;
1988                         realinputs[i] = pin;
1989                     }
1990
1991                     if( i >= ninputs )
1992                     {
1993                         // Allocate new memory to prevent collisions during memory
1994                         // reusing (see https://github.com/opencv/opencv/pull/10456).
1995                         output = output.clone();
1996                         if (preferableBackend == DNN_BACKEND_OPENCV &&
1997                             IS_DNN_OPENCL_TARGET(preferableTarget))
1998                         {
1999                             std::vector<UMat> umats(1);
2000                             umat_output = umat_output.clone();
2001                             umats[0] = umat_output;
2002                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2003                         }
2004                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2005                         int ofs = 0;
2006                         for( i = 0; i < ninputs; i++ )
2007                         {
2008                             LayerPin pin = realinputs[i];
2009                             LayerData* inp_i_data = &layers[pin.lid];
2010                             int channels_i = ld.inputBlobs[i]->size[1];
2011                             chrange[1] = Range(ofs, ofs + channels_i);
2012                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2013                                    pin.oid, ofs, ofs + channels_i));
2014                             ofs += channels_i;
2015                             Mat output_slice = output(chrange);
2016                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2017                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2018                             Mat* oldPtr = &curr_output;
2019                             curr_output = output_slice;
2020                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2021                             {
2022                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2023                                 umats[pin.oid] = umat_output(chrange);
2024                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2025                             }
2026                             // Layers that referred to the old input Mat will now refer to the
2027                             // new data through the same Mat object.
2028                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2029                         }
2030                         ld.skip = true;
2031                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2032                     }
2033                 }
2034             }
2035         }
2036     }
2037
2038     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2039     {
2040         CV_TRACE_FUNCTION();
2041
2042         MapIdToLayerData::iterator it;
2043         for (it = layers.begin(); it != layers.end(); it++)
2044             it->second.flag = 0;
2045
2046         CV_Assert(!layers[0].outputBlobs.empty());
2047         ShapesVec inputShapes;
2048         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2049         {
2050             Mat& inp = layers[0].outputBlobs[i];
2051             CV_Assert(inp.total());
2052             if (preferableBackend == DNN_BACKEND_OPENCV &&
2053                 preferableTarget == DNN_TARGET_OPENCL_FP16)
2054             {
2055                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);  // FP16 data is stored in CV_16S blobs
2056             }
2057             inputShapes.push_back(shape(inp));
2058         }
2059         LayersShapesMap layersShapes;
2060         getLayersShapes(inputShapes, layersShapes);
2061
2062         blobManager.reset();
2063         backendWrappers.clear();
2064         // Add fake references to the input blobs so that their memory is not reused by the blob manager.
2065         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2066             blobManager.addReference(LayerPin(0, i));
2067         for (it = layers.begin(); it != layers.end(); ++it)
2068         {
2069             const LayerData& ld = it->second;
2070             blobManager.addReferences(ld.inputBlobsId);
2071         }
2072
2073         for (int i = 0; i < blobsToKeep_.size(); i++)
2074         {
2075             blobManager.addReference(blobsToKeep_[i]);
2076         }
2077
2078         for (it = layers.begin(); it != layers.end(); it++)
2079         {
2080             int lid = it->first;
2081             allocateLayer(lid, layersShapes);
2082         }
2083
2084         layersTimings.resize(lastLayerId + 1, 0);
2085         fuseLayers(blobsToKeep_);
2086     }
2087
2088     void forwardLayer(LayerData &ld)
2089     {
2090         CV_TRACE_FUNCTION();
2091
2092         Ptr<Layer> layer = ld.layerInstance;
2093
2094         TickMeter tm;
2095         tm.start();
2096
2097         if( !ld.skip )
2098         {
2099             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2100             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2101             {
2102                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2103                 {
2104                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2105                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2106                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2107                     layer->forward(umat_inputBlobs,
2108                                    umat_outputBlobs,
2109                                    umat_internalBlobs);
2110                     if (DNN_CHECK_NAN_INF)
2111                     {
2112                         bool fail = false;
2113                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2114                         {
2115                             UMat& u = umat_outputBlobs[i];
2116                             Mat m;
2117                             if (u.depth() == CV_16S) // FP16
2118                                 convertFp16(u, m);
2119                             else
2120                                 m = u.getMat(ACCESS_READ);
2121                             if (!checkRange(m))
2122                             {
2123                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2124                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2125                                 fail = true;
2126                             }
2127                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2128                             {
2129                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2130                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2131                                 fail = true;
2132                             }
2133                         }
2134                         if (fail)
2135                         {
2136                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2137                             {
2138                                 UMat& u = umat_inputBlobs[i];
2139                                 Mat m;
2140                                 if (u.depth() == CV_16S) // FP16
2141                                     convertFp16(u, m);
2142                                 else
2143                                     m = u.getMat(ACCESS_READ);
2144                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2145                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2146                             }
2147                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2148                             {
2149                                 UMat& u = umat_outputBlobs[i];
2150                                 Mat m;
2151                                 if (u.depth() == CV_16S) // FP16
2152                                     convertFp16(u, m);
2153                                 else
2154                                     m = u.getMat(ACCESS_READ);
2155                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2156                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2157                             }
2158                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2159                             {
2160                                 UMat& u = umat_internalBlobs[i];
2161                                 Mat m;
2162                                 if (u.depth() == CV_16S) // FP16
2163                                     convertFp16(u, m);
2164                                 else
2165                                     m = u.getMat(ACCESS_READ);
2166                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2167                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2168                             }
2169                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2170                                 CV_Assert(!fail);
2171                         }
2172                     }
2173                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2174                 }
2175                 else
2176                 {
2177                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2178                     {
2179                         if (!ld.inputBlobsWrappers[i].empty())
2180                             ld.inputBlobsWrappers[i]->copyToHost();
2181                     }
2182
2183                     std::vector<Mat> inps(ld.inputBlobs.size());
2184                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
2185                     {
2186                         inps[i] = *ld.inputBlobs[i];
2187                     }
2188                     layer->forward(inps, ld.outputBlobs, ld.internals);
2189
2190                     if (DNN_CHECK_NAN_INF)
2191                     {
2192                         bool fail = false;
2193                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2194                         {
2195                             const Mat& m = ld.outputBlobs[i];
2196                             if (!checkRange(m))
2197                             {
2198                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2199                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2200                                 fail = true;
2201                             }
2202                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2203                             {
2204                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2205                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2206                                 fail = true;
2207                             }
2208                         }
2209                         if (fail)
2210                         {
2211                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2212                             {
2213                                 const Mat* pM = ld.inputBlobs[i];
2214                                 if (!pM)
2215                                 {
2216                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
2217                                     continue;
2218                                 }
2219                                 const Mat& m = *pM;
2220                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2221                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2222                             }
2223                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2224                             {
2225                                 const Mat& m = ld.outputBlobs[i];
2226                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2227                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2228                             }
2229                             for (size_t i = 0; i < ld.internals.size(); ++i)
2230                             {
2231                                 const Mat& m = ld.internals[i];
2232                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2233                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2234                             }
2235                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2236                                 CV_Assert(!fail);
2237                         }
2238                     }
2239
2240                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2241                     {
2242                         if (!ld.outputBlobsWrappers[i].empty())
2243                             ld.outputBlobsWrappers[i]->setHostDirty();
2244                     }
2245                 }
2246             }
2247             else
2248             {
2249                 Ptr<BackendNode> node = it->second;
2250                 CV_Assert(!node.empty());
2251                 if (preferableBackend == DNN_BACKEND_HALIDE)
2252                 {
2253                     forwardHalide(ld.outputBlobsWrappers, node);
2254                 }
2255                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2256                 {
2257                     forwardInfEngine(node);
2258                 }
2259                 else
2260                 {
2261                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2262                 }
2263             }
2264         }
2265         else
2266             tm.reset();
2267
2268         tm.stop();
2269         layersTimings[ld.id] = tm.getTimeTicks();
2270
2271         ld.flag = 1;
2272     }
2273
2274     void forwardToLayer(LayerData &ld, bool clearFlags = true)
2275     {
2276         CV_TRACE_FUNCTION();
2277
2278         if (clearFlags)
2279         {
2280             MapIdToLayerData::iterator it;
2281             for (it = layers.begin(); it != layers.end(); it++)
2282                 it->second.flag = 0;
2283         }
2284
2285         // already forwarded
2286         if (ld.flag)
2287             return;
2288
2289         //forward parents
2290         MapIdToLayerData::iterator it;
2291         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2292         {
2293             LayerData &prevLd = it->second;
2294             if (prevLd.flag)
2295                 continue;
2296             forwardLayer(prevLd);
2297         }
2298
2299         //forward itself
2300         forwardLayer(ld);
2301     }
2302
2303     void forwardAll()
2304     {
2305         CV_TRACE_FUNCTION();
2306
2307         MapIdToLayerData::reverse_iterator last_layer = layers.rbegin();
2308         CV_Assert(last_layer != layers.rend());
2309         forwardToLayer(last_layer->second, true);
2310     }
2311
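    // Recursively infer the in/out/internal shapes for the layer with the given id:
    // the producers' shapes are resolved first, and inOutShapes doubles as a
    // memoization cache so each layer's shapes are computed only once.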
2312     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2313     {
2314         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2315
2316         if (inOutShapes[id].in.empty())
2317         {
2318             for(int i = 0; i < inputLayerIds.size(); i++)
2319             {
2320                 int layerId = inputLayerIds[i].lid;
2321                 LayersShapesMap::iterator it =
2322                         inOutShapes.find(layerId);
2323                 if(it == inOutShapes.end() ||
2324                         it->second.out.empty())
2325                 {
2326                     getLayerShapesRecursively(layerId, inOutShapes);
2327                 }
2328                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2329                 inOutShapes[id].in.push_back(shape);
2330             }
2331         }
2332         const ShapesVec& is = inOutShapes[id].in;
2333         ShapesVec& os = inOutShapes[id].out;
2334         ShapesVec& ints = inOutShapes[id].internal;
2335         int requiredOutputs = layers[id].requiredOutputs.size();
2336         inOutShapes[id].supportInPlace =
2337                 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2338     }
2339
2340     void getLayersShapes(const ShapesVec& netInputShapes,
2341                          LayersShapesMap& inOutShapes)
2342     {
2343         inOutShapes.clear();
2344
2345         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2346         for (MapIdToLayerData::iterator it = layers.begin();
2347              it != layers.end(); it++)
2348         {
2349             getLayerShapesRecursively(it->first, inOutShapes);
2350         }
2351     }
2352
2353     void getLayerShapes(const ShapesVec& netInputShapes,
2354                         const int layerId,
2355                         LayerShapes& shapes)
2356     {
2357         LayersShapesMap inOutShapes;
2358         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2359         getLayerShapesRecursively(layerId, inOutShapes);
2360         shapes = inOutShapes[layerId];
2361     }
2362
2363     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2364     {
2365         return *std::max_element(pins.begin(), pins.end());
2366     }
2367
2368     Mat getBlob(const LayerPin& pin)
2369     {
2370         CV_TRACE_FUNCTION();
2371
2372         if (!pin.valid())
2373             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2374
2375         LayerData &ld = layers[pin.lid];
2376         if ((size_t)pin.oid >= ld.outputBlobs.size())
2377         {
2378             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2379                                            "but output #%d was requested", ld.name.c_str(),
2380                                            (int)ld.outputBlobs.size(), pin.oid));
2381         }
2382         if (preferableTarget != DNN_TARGET_CPU)
2383         {
2384             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2385             // Transfer data to CPU if required.
2386             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2387         }
2388
2389         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2390         {
2391             convertFp16(ld.outputBlobs[pin.oid], output_blob);
2392             return output_blob;
2393         }
2394         else
2395             return ld.outputBlobs[pin.oid];
2396     }
2397
2398     Mat getBlob(String outputName)
2399     {
2400         return getBlob(getPinByAlias(outputName));
2401     }
2402 };
2403
2404 Net::Net() : impl(new Net::Impl)
2405 {
2406 }
2407
2408 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2409 {
2410 #ifndef HAVE_INF_ENGINE
2411     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2412 #else
2413     InferenceEngine::CNNNetReader reader;
2414     reader.ReadNetwork(xml);
2415     reader.ReadWeights(bin);
2416
2417     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2418
2419     std::vector<String> inputsNames;
2420     for (auto& it : ieNet.getInputsInfo())
2421     {
2422         inputsNames.push_back(it.first);
2423     }
2424
2425     Net cvNet;
2426     cvNet.setInputsNames(inputsNames);
2427
2428     Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(0));
2429     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2430     for (auto& it : ieNet.getOutputsInfo())
2431     {
2432         Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
2433         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2434         CV_Assert(ieLayer);
2435
2436         LayerParams lp;
2437         int lid = cvNet.addLayer(it.first, "", lp);
2438
2439         LayerData& ld = cvNet.impl->layers[lid];
2440         cvLayer->name = it.first;
2441         cvLayer->type = ieLayer->type;
2442         ld.layerInstance = cvLayer;
2443         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2444
2445         for (int i = 0; i < inputsNames.size(); ++i)
2446             cvNet.connect(0, i, lid, i);
2447     }
2448     cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2449
2450     cvNet.impl->skipInfEngineInit = true;
2451     return cvNet;
2452 #endif  // HAVE_INF_ENGINE
2453 }
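
#if 0
// Minimal usage sketch for the Model Optimizer loader (illustrative only;
// "model.xml" and "model.bin" are placeholder paths to an OpenVINO IR pair):
static void exampleReadFromModelOptimizer()
{
    Net net = Net::readFromModelOptimizer("model.xml", "model.bin");
    // The returned network is already bound to the Inference Engine backend,
    // so the forward pass runs inside the Inference Engine.
}
#endif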
2454
2455 Net::~Net()
2456 {
2457 }
2458
2459 int Net::addLayer(const String &name, const String &type, LayerParams &params)
2460 {
2461     CV_TRACE_FUNCTION();
2462
2463     if (impl->getLayerId(name) >= 0)
2464     {
2465         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" is already in the net");
2466         return -1;
2467     }
2468
2469     int id = ++impl->lastLayerId;
2470     impl->layerNameToId.insert(std::make_pair(name, id));
2471     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2472
2473     return id;
2474 }
2475
2476 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
2477 {
2478     CV_TRACE_FUNCTION();
2479
2480     int prvLid = impl->lastLayerId;
2481     int newLid = this->addLayer(name, type, params);
2482     this->connect(prvLid, 0, newLid, 0);
2483     return newLid;
2484 }
2485
2486 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2487 {
2488     CV_TRACE_FUNCTION();
2489
2490     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2491 }
2492
2493 void Net::connect(String _outPin, String _inPin)
2494 {
2495     CV_TRACE_FUNCTION();
2496
2497     LayerPin outPin = impl->getPinByAlias(_outPin);
2498     LayerPin inpPin = impl->getPinByAlias(_inPin);
2499
2500     CV_Assert(outPin.valid() && inpPin.valid());
2501
2502     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2503 }
2504
2505 Mat Net::forward(const String& outputName)
2506 {
2507     CV_TRACE_FUNCTION();
2508
2509     String layerName = outputName;
2510
2511     if (layerName.empty())
2512         layerName = getLayerNames().back();
2513
2514     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2515     impl->setUpNet(pins);
2516     impl->forwardToLayer(impl->getLayerData(layerName));
2517
2518     return impl->getBlob(layerName);
2519 }
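
// Usage sketch (illustrative; the output name "prob" is a placeholder):
//     Mat prob = net.forward("prob");  // run the net up to the layer producing "prob"
//     Mat last = net.forward();        // or up to the last registered layer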
2520
2521 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2522 {
2523     CV_TRACE_FUNCTION();
2524
2525     String layerName = outputName;
2526
2527     if (layerName.empty())
2528         layerName = getLayerNames().back();
2529
2530     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2531     impl->setUpNet(pins);
2532     impl->forwardToLayer(impl->getLayerData(layerName));
2533
2534     LayerPin pin = impl->getPinByAlias(layerName);
2535     LayerData &ld = impl->layers[pin.lid];
2536
2537     if (outputBlobs.isUMat())
2538     {
2539         impl->getBlob(layerName).copyTo(outputBlobs);
2540     }
2541     else if (outputBlobs.isMat())
2542     {
2543         outputBlobs.assign(impl->getBlob(layerName));
2544     }
2545     else if (outputBlobs.isMatVector())
2546     {
2547         if (impl->preferableTarget != DNN_TARGET_CPU)
2548         {
2549             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2550             {
2551                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2552                 ld.outputBlobsWrappers[i]->copyToHost();
2553             }
2554         }
2555         if (ld.outputBlobs[0].depth() == CV_32F)
2556         {
2557             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2558             outputvec = ld.outputBlobs;
2559         } else {
2560             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2561             outputvec.resize(ld.outputBlobs.size());
2562             for (int i = 0; i < outputvec.size(); i++)
2563                 convertFp16(ld.outputBlobs[i], outputvec[i]);
2564         }
2565     }
2566     else if (outputBlobs.isUMatVector())
2567     {
2568         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
2569
2570         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
2571             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
2572         {
2573             if (impl->preferableTarget == DNN_TARGET_OPENCL)
2574                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2575             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
2576             {
2577                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2578                 outputvec.resize(out_vec.size());
2579                 for (int i = 0; i < out_vec.size(); i++)
2580                     convertFp16(out_vec[i], outputvec[i]);
2581             }
2582         }
2583         else
2584         {
2585             outputvec.resize(ld.outputBlobs.size());
2586             for (int i = 0; i < outputvec.size(); ++i)
2587                 ld.outputBlobs[i].copyTo(outputvec[i]);
2588         }
2589     }
2590 }
2591
2592 void Net::forward(OutputArrayOfArrays outputBlobs,
2593                   const std::vector<String>& outBlobNames)
2594 {
2595     CV_TRACE_FUNCTION();
2596
2597     std::vector<LayerPin> pins;
2598     for (int i = 0; i < outBlobNames.size(); i++)
2599     {
2600         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2601     }
2602
2603     impl->setUpNet(pins);
2604
2605     LayerPin out = impl->getLatestLayerPin(pins);
2606
2607     impl->forwardToLayer(impl->getLayerData(out.lid));
2608
2609     std::vector<Mat> matvec;
2610     for (int i = 0; i < pins.size(); i++)
2611     {
2612         matvec.push_back(impl->getBlob(pins[i]));
2613     }
2614
2615     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2616     outputvec = matvec;
2617 }
2618
2619 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
2620                      const std::vector<String>& outBlobNames)
2621 {
2622     CV_TRACE_FUNCTION();
2623
2624     std::vector<LayerPin> pins;
2625     for (int i = 0; i < outBlobNames.size(); i++)
2626     {
2627         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2628         pins.insert(pins.end(), lp.begin(), lp.end());
2629     }
2630
2631     impl->setUpNet(pins);
2632
2633     LayerPin out = impl->getLatestLayerPin(pins);
2634
2635     impl->forwardToLayer(impl->getLayerData(out.lid));
2636
2637     outputBlobs.resize(outBlobNames.size());
2638     for (int i = 0; i < outBlobNames.size(); i++)
2639     {
2640         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2641         for (int j = 0; j < lp.size(); j++)
2642         {
2643             outputBlobs[i].push_back(impl->getBlob(lp[j]));
2644         }
2645     }
2646 }
2647
void Net::setPreferableBackend(int backendId)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(backendId);

    if (impl->preferableBackend != backendId)
    {
        impl->preferableBackend = backendId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setPreferableTarget(int targetId)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(targetId);

    if (impl->preferableTarget != targetId)
    {
        impl->preferableTarget = targetId;
        if (IS_DNN_OPENCL_TARGET(targetId))
        {
#ifndef HAVE_OPENCL
#ifdef HAVE_INF_ENGINE
            if (impl->preferableBackend == DNN_BACKEND_OPENCV)
#else
            if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
                impl->preferableBackend == DNN_BACKEND_OPENCV)
#endif  // HAVE_INF_ENGINE
                impl->preferableTarget = DNN_TARGET_CPU;
#else
            bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
            if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
                impl->preferableTarget = DNN_TARGET_OPENCL;
#endif
        }
        impl->netWasAllocated = false;
        impl->clear();
    }
}

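// Typical configuration sketch: select the backend, then the target. Either
// setter invalidates the allocated net, so the next forward() reallocates.
// Note the fallbacks implemented above: with the OpenCV backend an OpenCL
// target degrades to CPU in builds without OpenCL, and OPENCL_FP16 degrades
// to OPENCL when the device lacks cl_khr_fp16.
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL_FP16);
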
void Net::setInputsNames(const std::vector<String> &inputBlobNames)
{
    CV_TRACE_FUNCTION();

    impl->netInputLayer->setNames(inputBlobNames);
}

void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    LayerPin pin;
    pin.lid = 0;
    pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);

    if (!pin.valid())
        CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");

    LayerData &ld = impl->layers[pin.lid];
    const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
    ld.outputBlobs.resize(numInputs);
    ld.outputBlobsWrappers.resize(numInputs);
    impl->netInputLayer->inputsData.resize(numInputs);
    impl->netInputLayer->scaleFactors.resize(numInputs);
    impl->netInputLayer->means.resize(numInputs);

    MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
    Mat blob_ = blob.getMat();
    bool oldShape = prevShape == shape(blob_);
    if (oldShape)
    {
        blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
    }
    else
    {
        ld.outputBlobs[pin.oid] = blob_.clone();
        impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
    }

    if (!ld.outputBlobsWrappers[pin.oid].empty())
    {
        ld.outputBlobsWrappers[pin.oid]->setHostDirty();
    }
    impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
    impl->netInputLayer->means[pin.oid] = mean;
    impl->netWasAllocated = impl->netWasAllocated && oldShape;
}

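// Feeding the network, as a sketch (the blob geometry must match what the
// model expects; "data" is a placeholder input name and `img` a BGR image):
//
//     Mat inputBlob = blobFromImage(img, 1.0, Size(224, 224), Scalar(), true);
//     net.setInput(inputBlob, "data");
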
Mat Net::getParam(LayerId layer, int numParam)
{
    LayerData &ld = impl->getLayerData(layer);
    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
    return layerBlobs[numParam];
}

void Net::setParam(LayerId layer, int numParam, const Mat &blob)
{
    LayerData &ld = impl->getLayerData(layer);

    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
    // We don't perform strong checks here; use this function carefully.
    layerBlobs[numParam] = blob;
}

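// Weight-surgery sketch using the accessors above (the layer id and blob
// index here are hypothetical and model-specific; shape compatibility is
// the caller's responsibility, as noted in setParam):
//
//     Mat w = net.getParam(1, 0).clone();
//     w *= 0.5;
//     net.setParam(1, 0, w);
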
int Net::getLayerId(const String &layer)
{
    return impl->getLayerId(layer);
}

Ptr<Layer> Net::getLayer(LayerId layerId)
{
    LayerData &ld = impl->getLayerData(layerId);
    return ld.getLayerInstance();
}

std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
{
    LayerData &ld = impl->getLayerData(layerId);
    if (!ld.layerInstance)
        CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));

    std::vector<Ptr<Layer> > inputLayers;
    inputLayers.reserve(ld.inputLayersId.size());
    std::set<int>::iterator it;
    for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
        inputLayers.push_back(getLayer(*it));
    }
    return inputLayers;
}

std::vector<String> Net::getLayerNames() const
{
    std::vector<String> res;
    res.reserve(impl->layers.size());

    Impl::MapIdToLayerData::iterator it;
    for (it = impl->layers.begin(); it != impl->layers.end(); it++)
    {
        if (it->second.id) // skip the default Data layer
            res.push_back(it->second.name);
    }

    return res;
}

bool Net::empty() const
{
    return impl->layers.size() <= 1; // the first layer is the default Data layer
}

std::vector<int> Net::getUnconnectedOutLayers() const
{
    std::vector<int> layersIds;

    Impl::MapIdToLayerData::iterator it;
    for (it = impl->layers.begin(); it != impl->layers.end(); it++)
    {
        int lid = it->first;
        LayerData &ld = it->second;

        if (ld.requiredOutputs.size() == 0)
            layersIds.push_back(lid);
    }

    return layersIds;
}

std::vector<String> Net::getUnconnectedOutLayersNames() const
{
    std::vector<int> ids = getUnconnectedOutLayers();
    const size_t n = ids.size();
    std::vector<String> names(n);
    for (size_t i = 0; i < n; ++i)
    {
        names[i] = impl->layers[ids[i]].name;
    }
    return names;
}

void Net::getLayersShapes(const ShapesVec& netInputShapes,
                          std::vector<int>& layersIds,
                          std::vector<ShapesVec>& inLayersShapes,
                          std::vector<ShapesVec>& outLayersShapes) const
{
    layersIds.clear();
    inLayersShapes.clear();
    outLayersShapes.clear();

    Impl::LayersShapesMap inOutShapes;
    impl->getLayersShapes(netInputShapes, inOutShapes);

    for (Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
         it != inOutShapes.end(); it++)
    {
        layersIds.push_back(it->first);
        inLayersShapes.push_back(it->second.in);
        outLayersShapes.push_back(it->second.out);
    }
}

void Net::getLayersShapes(const MatShape& netInputShape,
                          std::vector<int>& layerIds,
                          std::vector<ShapesVec>& inLayersShapes,
                          std::vector<ShapesVec>& outLayersShapes) const
{
    getLayersShapes(ShapesVec(1, netInputShape),
                    layerIds, inLayersShapes, outLayersShapes);
}

void Net::getLayerShapes(const MatShape& netInputShape,
                         const int layerId,
                         ShapesVec& inLayerShapes,
                         ShapesVec& outLayerShapes) const
{
    getLayerShapes(ShapesVec(1, netInputShape),
                   layerId, inLayerShapes, outLayerShapes);
}

void Net::getLayerShapes(const ShapesVec& netInputShapes,
                         const int layerId,
                         ShapesVec& inLayerShapes,
                         ShapesVec& outLayerShapes) const
{
    LayerShapes shapes;
    impl->getLayerShapes(netInputShapes, layerId, shapes);
    inLayerShapes = shapes.in;
    outLayerShapes = shapes.out;
}

int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
{
    CV_TRACE_FUNCTION();

    int64 flops = 0;
    std::vector<int> ids;
    std::vector<std::vector<MatShape> > inShapes, outShapes;
    getLayersShapes(netInputShapes, ids, inShapes, outShapes);
    CV_Assert(inShapes.size() == outShapes.size());
    CV_Assert(inShapes.size() == ids.size());

    for (size_t i = 0; i < ids.size(); i++)
    {
        flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
                                                                   outShapes[i]);
    }

    return flops;
}

int64 Net::getFLOPS(const MatShape& netInputShape) const
{
    return getFLOPS(std::vector<MatShape>(1, netInputShape));
}

int64 Net::getFLOPS(const int layerId,
                    const std::vector<MatShape>& netInputShapes) const
{
    Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
    CV_Assert(layer != impl->layers.end());

    LayerShapes shapes;
    impl->getLayerShapes(netInputShapes, layerId, shapes);

    return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
}

int64 Net::getFLOPS(const int layerId,
                    const MatShape& netInputShape) const
{
    return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
}

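// Complexity-estimate sketch: shape() from shape_utils.hpp builds a
// MatShape, here for a hypothetical 1x3x224x224 input:
//
//     int64 flops = net.getFLOPS(shape(1, 3, 224, 224));
//     std::cout << "GFLOPs: " << flops * 1e-9 << std::endl;
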
void Net::getLayerTypes(std::vector<String>& layersTypes) const
{
    layersTypes.clear();

    std::map<String, int> layers;
    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
         it != impl->layers.end(); it++)
    {
        if (layers.find(it->second.type) == layers.end())
            layers[it->second.type] = 0;
        layers[it->second.type]++;
    }

    for (std::map<String, int>::iterator it = layers.begin();
         it != layers.end(); it++)
    {
        layersTypes.push_back(it->first);
    }
}

int Net::getLayersCount(const String& layerType) const
{
    int count = 0;
    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
         it != impl->layers.end(); it++)
    {
        if (it->second.type == layerType)
            count++;
    }
    return count;
}

void Net::getMemoryConsumption(const int layerId,
                               const std::vector<MatShape>& netInputShapes,
                               size_t& weights, size_t& blobs) const
{
    CV_TRACE_FUNCTION();

    Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
    CV_Assert(layer != impl->layers.end());

    weights = blobs = 0;

    for (size_t i = 0; i < layer->second.params.blobs.size(); i++)
    {
        const Mat& weightsBlob = layer->second.params.blobs[i];
        weights += weightsBlob.total()*weightsBlob.elemSize();
    }

    ShapesVec inLayerShapes, outLayerShapes;
    getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
    for (size_t i = 0; i < outLayerShapes.size(); i++)
    {
        blobs += total(outLayerShapes[i]) * sizeof(float);
    }
}

void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
                               size_t& weights, size_t& blobs) const
{
    CV_TRACE_FUNCTION();

    std::vector<int> layerIds;
    std::vector<size_t> w, b;
    getMemoryConsumption(netInputShapes, layerIds, w, b);

    weights = blobs = 0;
    for (size_t i = 0; i < layerIds.size(); i++)
    {
        weights += w[i];
        blobs += b[i];
    }
}

void Net::getMemoryConsumption(const int layerId,
                               const MatShape& netInputShape,
                               size_t& weights, size_t& blobs) const
{
    getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
                         weights, blobs);
}

void Net::getMemoryConsumption(const MatShape& netInputShape,
                               size_t& weights, size_t& blobs) const
{
    getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
                         weights, blobs);
}

void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
                               std::vector<int>& layerIds, std::vector<size_t>& weights,
                               std::vector<size_t>& blobs) const
{
    CV_TRACE_FUNCTION();

    layerIds.clear();
    weights.clear();
    blobs.clear();

    std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;

    getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);

    for (size_t i = 0; i < layerIds.size(); i++)
    {
        // Per-layer byte counters; size_t avoids the truncation that an int
        // accumulator would suffer on large models.
        size_t w = 0, b = 0;
        Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
        CV_Assert(layer != impl->layers.end());

        for (size_t j = 0; j < layer->second.params.blobs.size(); j++)
        {
            const Mat& weightsBlob = layer->second.params.blobs[j];
            w += weightsBlob.total()*weightsBlob.elemSize();
        }

        for (size_t j = 0; j < outLayerShapes[i].size(); j++)
        {
            b += total(outLayerShapes[i][j]) * sizeof(float);
        }

        weights.push_back(w);
        blobs.push_back(b);
    }
}

void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
                               std::vector<size_t>& weights, std::vector<size_t>& blobs) const
{
    getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
                         weights, blobs);
}

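// Memory-estimate sketch for the same hypothetical input shape; `weights`
// counts parameter bytes and `blobs` counts fp32 output-blob bytes:
//
//     size_t weights = 0, blobs = 0;
//     net.getMemoryConsumption(shape(1, 3, 224, 224), weights, blobs);
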
void Net::enableFusion(bool fusion)
{
    if (impl->fusion != fusion)
    {
        impl->fusion = fusion;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setHalideScheduler(const String& scheduler)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());

    impl->halideConfigFile = scheduler;
}

int64 Net::getPerfProfile(std::vector<double>& timings)
{
    timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
    int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
    return total;
}

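// Profiling sketch: getPerfProfile() reports timings in ticks, so divide by
// getTickFrequency() to convert, e.g. to milliseconds:
//
//     std::vector<double> layerTimes;
//     int64 total = net.getPerfProfile(layerTimes);
//     double ms = total * 1000.0 / getTickFrequency();
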
//////////////////////////////////////////////////////////////////////////

Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }

Layer::Layer(const LayerParams &params)
    : blobs(params.blobs), name(params.name), type(params.type)
{
    preferableTarget = DNN_TARGET_CPU;
}

void Layer::setParamsFrom(const LayerParams &params)
{
    blobs = params.blobs;
    name = params.name;
    type = params.type;
}

int Layer::inputNameToIndex(String)
{
    return -1;
}

int Layer::outputNameToIndex(const String&)
{
    return 0;
}

bool Layer::supportBackend(int backendId)
{
    return backendId == DNN_BACKEND_OPENCV;
}

Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
{
    CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
                                       " layers is not defined.");
    return Ptr<BackendNode>();
}

Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
{
    CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
                                       " layers is not defined.");
    return Ptr<BackendNode>();
}

void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
                                 const std::vector<Mat> &outputs, int targetId) const
{
#ifdef HAVE_HALIDE
    CV_TRACE_FUNCTION();

    Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
                xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

    int outW, outH, outC, outN;
    getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

    if (targetId == DNN_TARGET_CPU)
    {
        if (outW == 1 && outH == 1)
        {
            if (outC + outN == 1)
                return;

            if (outC > 8)
              top.split(c, co, ci, 8)
                 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                 .parallel(tile)
                 .vectorize(ci, 8);
            else
              top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                 .parallel(tile);
        }
        else
        {
            if (outH > 2)
            {
                top.reorder(x, c, y)
                   .split(y, yo, yi, 2)
                   .fuse(yo, n, tile)
                   .parallel(tile)
                   .unroll(yi)
                   .vectorize(x, outW >= 16 ? 16 : outW);
            }
        }
    }
    else if (targetId == DNN_TARGET_OPENCL)
    {
        if (outW == 1 && outH == 1)
        {
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
            top.split(c, co, ci, c_split)
               .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
               .gpu_blocks(tile)
               .gpu_threads(ci);
        }
        else
        {
            int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
            int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
            // Supported vectorization widths: 2, 3, 4, 8, 16
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
            top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
               .split(c, co, ci, c_split)
               .gpu_blocks(xo, yo, co)
               .gpu_threads(xi, yi)
               .reorder(xi, yi, ci, xo, yo, co)
               .vectorize(ci);
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
#endif  // HAVE_HALIDE
}

Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
{
    CV_UNUSED(node);
    return Ptr<BackendNode>();
}

bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::tryFuse(Ptr<Layer>&) { return false; }
void Layer::getScaleShift(Mat& scale, Mat& shift) const
{
    scale = Mat();
    shift = Mat();
}

void Layer::unsetAttached()
{
    setActivation(Ptr<ActivationLayer>());
}

template <typename T>
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
{
    pv.resize(v.size());
    for (size_t i = 0; i < v.size(); i++)
        pv[i] = const_cast<T*>(&v[i]);
}

void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
{
    CV_TRACE_FUNCTION();
    this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
}

void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
{
    CV_UNUSED(input); CV_UNUSED(output);
}

void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> inputs, outputs;
    inputs_arr.getMatVector(inputs);
    outputs_arr.getMatVector(outputs);

    std::vector<Mat*> inputsp;
    vecToPVec(inputs, inputsp);
    this->finalize(inputsp, outputs);
}

std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
{
    CV_TRACE_FUNCTION();

    std::vector<Mat> outputs;
    this->finalize(inputs, outputs);
    return outputs;
}

void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
{
    // This method is kept for backward compatibility; DNN now calls it only
    // to support users' implementations.
    CV_UNUSED(input); CV_UNUSED(output); CV_UNUSED(internals);
}

void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}

void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
    {
        // FP16 path: convert inputs to FP32, run the layer, then convert the
        // results back so the caller keeps seeing FP16 buffers.
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

        std::vector<UMat> orig_inputs;
        std::vector<UMat> orig_outputs;
        std::vector<UMat> orig_internals;

        inputs_arr.getUMatVector(orig_inputs);
        outputs_arr.getUMatVector(orig_outputs);
        internals_arr.getUMatVector(orig_internals);

        inputs.resize(orig_inputs.size());
        for (size_t i = 0; i < orig_inputs.size(); i++)
            convertFp16(orig_inputs[i], inputs[i]);

        outputs.resize(orig_outputs.size());
        for (size_t i = 0; i < orig_outputs.size(); i++)
            outputs[i].create(shape(orig_outputs[i]), CV_32F);

        internals.resize(orig_internals.size());
        for (size_t i = 0; i < orig_internals.size(); i++)
            internals[i].create(shape(orig_internals[i]), CV_32F);

        forward(inputs, outputs, internals);

        for (size_t i = 0; i < outputs.size(); i++)
            convertFp16(outputs[i], orig_outputs[i]);

        // sync results back
        outputs_arr.assign(orig_outputs);
        internals_arr.assign(orig_internals);
        return;
    }
    std::vector<Mat> inpvec;
    std::vector<Mat> outputs;
    std::vector<Mat> internals;

    inputs_arr.getMatVector(inpvec);
    outputs_arr.getMatVector(outputs);
    internals_arr.getMatVector(internals);

    std::vector<Mat*> inputs(inpvec.size());
    for (size_t i = 0; i < inpvec.size(); i++)
        inputs[i] = &inpvec[i];

    this->forward(inputs, outputs, internals);

    // sync results back
    outputs_arr.assign(outputs);
    internals_arr.assign(internals);
}

void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
    CV_TRACE_FUNCTION();

    this->finalize(inputs, outputs);
    this->forward(inputs, outputs, internals);
}

Layer::~Layer() {}

bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
                            const int requiredOutputs,
                            std::vector<MatShape> &outputs,
                            std::vector<MatShape> &internals) const
{
    CV_Assert(inputs.size());
    outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
    return false;
}

//////////////////////////////////////////////////////////////////////////

static Mutex& getLayerFactoryMutex()
{
    static Mutex* volatile instance = NULL;
    if (instance == NULL)
    {
        cv::AutoLock lock(getInitializationMutex());
        if (instance == NULL)
            instance = new Mutex();
    }
    return *instance;
}

typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;

static LayerFactory_Impl& getLayerFactoryImpl_()
{
    static LayerFactory_Impl impl;
    return impl;
}

static LayerFactory_Impl& getLayerFactoryImpl()
{
    static LayerFactory_Impl* volatile instance = NULL;
    if (instance == NULL)
    {
        cv::AutoLock lock(getLayerFactoryMutex());
        if (instance == NULL)
        {
            instance = &getLayerFactoryImpl_();
            initializeLayerFactory();
        }
    }
    return *instance;
}

void LayerFactory::registerLayer(const String &type, Constructor constructor)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();
    LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);

    if (it != getLayerFactoryImpl().end())
    {
        if (it->second.back() == constructor)
            CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
        it->second.push_back(constructor);
    }
    else
    {
        getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));
    }
}

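// Registration sketch for a user-defined layer ("MyType" and MyLayer are
// hypothetical; the factory expects a function matching
// LayerFactory::Constructor):
//
//     static Ptr<Layer> createMyLayer(LayerParams& params)
//     {
//         return Ptr<Layer>(new MyLayer(params));
//     }
//
//     LayerFactory::registerLayer("MyType", createMyLayer);
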
void LayerFactory::unregisterLayer(const String &type)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();

    LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
    if (it != getLayerFactoryImpl().end())
    {
        if (it->second.size() > 1)
            it->second.pop_back();
        else
            getLayerFactoryImpl().erase(it);
    }
}

Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();
    LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);

    if (it != getLayerFactoryImpl().end())
    {
        CV_Assert(!it->second.empty());
        return it->second.back()(params);
    }
    else
    {
        return Ptr<Layer>(); // NULL
    }
}

BackendNode::BackendNode(int backendId) : backendId(backendId) {}

BackendNode::~BackendNode() {}

BackendWrapper::BackendWrapper(int backendId, int targetId)
    : backendId(backendId), targetId(targetId) {}

BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
{
    CV_UNUSED(targetId); CV_UNUSED(m);
    CV_Error(Error::StsNotImplemented,
             "Constructor of backend wrapper must be implemented");
}

BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
{
    CV_UNUSED(base); CV_UNUSED(shape);
    CV_Error(Error::StsNotImplemented,
             "Constructor of backend wrapper must be implemented");
}

BackendWrapper::~BackendWrapper() {}

Net readNet(const String& _model, const String& _config, const String& _framework)
{
    String framework = _framework.toLowerCase();
    String model = _model;
    String config = _config;
    const std::string modelExt = model.substr(model.rfind('.') + 1);
    const std::string configExt = config.substr(config.rfind('.') + 1);
    if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
                                modelExt == "prototxt" || configExt == "prototxt")
    {
        if (modelExt == "prototxt" || configExt == "caffemodel")
            std::swap(model, config);
        return readNetFromCaffe(config, model);
    }
    if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
                                     modelExt == "pbtxt" || configExt == "pbtxt")
    {
        if (modelExt == "pbtxt" || configExt == "pb")
            std::swap(model, config);
        return readNetFromTensorflow(model, config);
    }
    if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
                                configExt == "t7" || configExt == "net")
    {
        return readNetFromTorch(model.empty() ? config : model);
    }
    if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
                                  modelExt == "cfg" || configExt == "cfg")
    {
        if (modelExt == "cfg" || configExt == "weights")
            std::swap(model, config);
        return readNetFromDarknet(config, model);
    }
    if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
                               modelExt == "xml" || configExt == "xml")
    {
        if (modelExt == "xml" || configExt == "bin")
            std::swap(model, config);
        return readNetFromModelOptimizer(config, model);
    }
    if (framework == "onnx" || modelExt == "onnx")
    {
        return readNetFromONNX(model);
    }
    CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
                                      model + (config.empty() ? "" : ", " + config));
}

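// Loading sketch (file names are placeholders): the framework is inferred
// from the extensions as implemented above, so model and config may be
// passed in either order for the supported pairs:
//
//     Net net = readNet("deploy.prototxt", "weights.caffemodel");
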
Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
            const std::vector<uchar>& bufferConfig)
{
    String framework = _framework.toLowerCase();
    if (framework == "caffe")
        return readNetFromCaffe(bufferConfig, bufferModel);
    else if (framework == "tensorflow")
        return readNetFromTensorflow(bufferModel, bufferConfig);
    else if (framework == "darknet")
        return readNetFromDarknet(bufferConfig, bufferModel);
    else if (framework == "torch")
        CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
    else if (framework == "dldt")
        CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
    CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
}

Net readNetFromModelOptimizer(const String &xml, const String &bin)
{
    return Net::readFromModelOptimizer(xml, bin);
}

CV__DNN_EXPERIMENTAL_NS_END
}} // namespace