1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
19 // * Redistributions of source code must retain the above copyright notice,
20 // this list of conditions and the following disclaimer.
22 // * Redistributions in binary form must reproduce the above copyright notice,
23 // this list of conditions and the following disclaimer in the documentation
24 // and/or other materials provided with the distribution.
26 // * The name of the copyright holders may not be used to endorse or promote products
27 // derived from this software without specific prior written permission.
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the Intel Corporation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
42 #include "precomp.hpp"
43 #include "op_halide.hpp"
44 #include "op_inf_engine.hpp"
45 #include "op_vkcom.hpp"
46 #include "op_cuda.hpp"
47 #include "halide_scheduler.hpp"
57 #include <opencv2/dnn/shape_utils.hpp>
58 #include <opencv2/imgproc.hpp>
60 #include <opencv2/core/utils/configuration.private.hpp>
61 #include <opencv2/core/utils/logger.hpp>
63 #include <opencv2/core/cuda.hpp>
67 CV__DNN_INLINE_NS_BEGIN
69 // This option is useful for running memory error detection with Valgrind.
70 static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
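// For example (shell usage is an illustration, not part of this file):
//   OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS=1 ./my_dnn_app
// disables blob reuse so that tools like Valgrind see every allocation separately.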
73 static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
76 static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
77 #ifdef HAVE_INF_ENGINE
78 (size_t)DNN_BACKEND_INFERENCE_ENGINE
80 (size_t)DNN_BACKEND_OPENCV
84 // Additional checks (slows down execution!)
85 static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
86 static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
87 static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);
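// For example (an illustration): OPENCV_DNN_CHECK_NAN_INF=1 enables the per-layer NaN/Inf scan,
// OPENCV_DNN_CHECK_NAN_INF_DUMP=1 additionally dumps the offending blobs, and
// OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR=1 makes a detection raise an error.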
94 //==================================================================================================
99 typedef std::vector< std::pair<Backend, Target> > BackendsList;
100 const BackendsList & getBackends() const { return backends; }
101 static BackendRegistry & getRegistry()
103 static BackendRegistry impl;
110 backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
112 if (cv::ocl::useOpenCL())
113 backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
115 #endif // HAVE_HALIDE
117 #ifdef HAVE_INF_ENGINE
118 if (checkIETarget(DNN_TARGET_CPU))
119 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU));
120 if (checkIETarget(DNN_TARGET_MYRIAD))
121 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
122 if (checkIETarget(DNN_TARGET_FPGA))
123 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_FPGA));
125 if (cv::ocl::useOpenCL() && ocl::Device::getDefault().isIntel())
127 if (checkIETarget(DNN_TARGET_OPENCL))
128 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL));
129 if (checkIETarget(DNN_TARGET_OPENCL_FP16))
130 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
133 #endif // HAVE_INF_ENGINE
136 if (cv::ocl::useOpenCL())
138 backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
139 backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
143 backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
147 backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
152 backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA));
153 backends.push_back(std::make_pair(DNN_BACKEND_CUDA, DNN_TARGET_CUDA_FP16));
157 static inline bool checkIETarget(int target)
159 #ifndef HAVE_INF_ENGINE
163 cv::dnn::LayerParams lp;
164 lp.set("kernel_size", 1);
165 lp.set("num_output", 1);
166 lp.set("bias_term", false);
167 lp.type = "Convolution";
168 lp.name = "testLayer";
169 lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
170 net.addLayerToPrev(lp.name, lp.type, lp);
171 net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
172 net.setPreferableTarget(target);
173 static int inpDims[] = {1, 2, 3, 4};
174 net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
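// The tiny 1x1 convolution network built above is used as a probe: running it on the
// requested target verifies that the Inference Engine plugin is actually usable here.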
187 BackendsList backends;
191 std::vector< std::pair<Backend, Target> > getAvailableBackends()
193 return BackendRegistry::getRegistry().getBackends();
196 std::vector<Target> getAvailableTargets(Backend be)
198 if (be == DNN_BACKEND_DEFAULT)
199 be = (Backend)PARAM_DNN_BACKEND_DEFAULT;
201 std::vector<Target> result;
202 const BackendRegistry::BackendsList all_backends = getAvailableBackends();
203 for(BackendRegistry::BackendsList::const_iterator i = all_backends.begin(); i != all_backends.end(); ++i )
206 result.push_back(i->second);
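// Illustrative caller-side usage (a sketch, not part of this file):
//
//     std::vector<cv::dnn::Target> targets =
//         cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
//     bool hasOpenCL = std::find(targets.begin(), targets.end(),
//                                cv::dnn::DNN_TARGET_OPENCL) != targets.end();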
211 //==================================================================================================
215 typedef std::vector<MatShape> ShapesVec;
219 ShapesVec in, out, internal;
220 // There is no guarantee that a layer which supports in-place computation
221 // will actually be computed in-place (input.data_ptr == output.data_ptr).
222 // If a layer reports that it can work in-place and the layers after it
223 // no longer use the input blob, we set output = input.
225 LayerShapes() {supportInPlace = false;}
229 Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
230 const Scalar& mean, bool swapRB, bool crop, int ddepth)
234 blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
238 void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
239 const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
242 std::vector<Mat> images(1, image.getMat());
243 blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
246 Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
247 const Scalar& mean, bool swapRB, bool crop, int ddepth)
251 blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
255 void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
256 Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
259 CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
262 CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
263 CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
266 std::vector<Mat> images;
267 images_.getMatVector(images);
268 CV_Assert(!images.empty());
269 for (size_t i = 0; i < images.size(); i++)
271 Size imgSize = images[i].size();
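// When crop is requested: scale so that one side matches the target size exactly and the
// other overflows, then take a centered crop of exactly 'size' (see the lines below).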
278 float resizeFactor = std::max(size.width / (float)imgSize.width,
279 size.height / (float)imgSize.height);
280 resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
281 Rect crop(Point(0.5 * (images[i].cols - size.width),
282 0.5 * (images[i].rows - size.height)),
284 images[i] = images[i](crop);
287 resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
289 if(images[i].depth() == CV_8U && ddepth == CV_32F)
290 images[i].convertTo(images[i], CV_32F);
293 std::swap(mean[0], mean[2]);
296 images[i] *= scalefactor;
299 size_t nimages = images.size();
300 Mat image0 = images[0];
301 int nch = image0.channels();
302 CV_Assert(image0.dims == 2);
303 if (nch == 3 || nch == 4)
305 int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
306 blob_.create(4, sz, ddepth);
307 Mat blob = blob_.getMat();
310 for(size_t i = 0; i < nimages; i++ )
312 const Mat& image = images[i];
313 CV_Assert(image.depth() == blob_.depth());
314 nch = image.channels();
315 CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
316 CV_Assert(image.size() == image0.size());
318 for( int j = 0; j < nch; j++ )
319 ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
321 std::swap(ch[0], ch[2]);
328 int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
329 blob_.create(4, sz, ddepth);
330 Mat blob = blob_.getMat();
332 for(size_t i = 0; i < nimages; i++ )
334 const Mat& image = images[i];
335 CV_Assert(image.depth() == blob_.depth());
336 nch = image.channels();
337 CV_Assert(image.dims == 2 && (nch == 1));
338 CV_Assert(image.size() == image0.size());
340 image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
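// Illustrative usage of the functions above (a sketch; "frame" stands for any 8-bit BGR
// cv::Mat and "net" for a loaded cv::dnn::Net, neither is defined in this file):
//
//     cv::Mat inputBlob = cv::dnn::blobFromImage(frame, 1.0, cv::Size(224, 224),
//                                                cv::Scalar(104, 117, 123),
//                                                true /*swapRB*/, false /*crop*/);
//     // inputBlob has shape 1 x 3 x 224 x 224 (NCHW) and depth CV_32F
//     net.setInput(inputBlob);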
345 void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
349 // A blob is a 4-dimensional matrix in floating-point precision:
350 // blob_[0] = batchSize = number of images
351 // blob_[1] = number of channels
354 CV_Assert(blob_.depth() == CV_32F);
355 CV_Assert(blob_.dims == 4);
357 images_.create(cv::Size(1, blob_.size[0]), blob_.depth());
359 std::vector<Mat> vectorOfChannels(blob_.size[1]);
360 for (int n = 0; n < blob_.size[0]; ++n)
362 for (int c = 0; c < blob_.size[1]; ++c)
364 vectorOfChannels[c] = getPlane(blob_, n, c);
366 cv::merge(vectorOfChannels, images_.getMatRef(n));
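// Illustrative round trip (a sketch): split a blob back into one multi-channel
// CV_32F image per batch item.
//
//     std::vector<cv::Mat> recovered;
//     cv::dnn::imagesFromBlob(inputBlob, recovered);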
370 class OpenCLBackendWrapper : public BackendWrapper
373 OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
380 OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
381 : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
383 Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
384 CV_Assert(!base.empty());
388 int shape[] = {1, (int)base->umat.total()};
389 umat = base->umat.reshape(1, 2, &shape[0])
390 .colRange(0, host->total())
391 .reshape(1, host->dims, &host->size[0]);
395 static Ptr<BackendWrapper> create(Mat& m)
397 return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
400 static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
402 return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
405 static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
407 const int numWrappers = wrappers.size();
408 std::vector<UMat> mats(wrappers.size());
409 for (int i = 0; i < numWrappers; ++i)
411 Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
412 CV_Assert(!umatWrapper.empty());
413 umatWrapper->copyToDevice();
414 mats[i] = umatWrapper->umat;
419 // Replaces all UMats in the wrappers with the given ones.
420 static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
421 const std::vector<UMat>& umats)
423 CV_Assert(wrappers.size() == umats.size());
424 for (int i = 0, n = umats.size(); i < n; ++i)
426 Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
427 CV_Assert(!umatWrapper.empty());
428 umatWrapper->umat = umats[i];
432 ~OpenCLBackendWrapper() {}
434 // Copies data from the device to host memory.
435 virtual void copyToHost() CV_OVERRIDE
440 virtual void setHostDirty() CV_OVERRIDE
465 LayerPin(int layerId = -1, int outputId = -1)
466 : lid(layerId), oid(outputId) {}
470 return (lid >= 0 && oid >= 0);
473 bool equal(const LayerPin &r) const
475 return (lid == r.lid && oid == r.oid);
478 bool operator<(const LayerPin &r) const
480 return lid < r.lid || (lid == r.lid && oid < r.oid);
483 bool operator ==(const LayerPin &r) const
485 return lid == r.lid && oid == r.oid;
491 LayerData() : id(-1), skip(false), flag(0) {}
492 LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
493 : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
507 std::vector<LayerPin> inputBlobsId;
508 std::set<int> inputLayersId;
509 std::set<int> requiredOutputs;
510 std::vector<LayerPin> consumers;
511 std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
512 std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
513 std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
515 Ptr<Layer> layerInstance;
516 std::vector<Mat> outputBlobs;
517 std::vector<Mat*> inputBlobs;
518 std::vector<Mat> internals;
519 // Computation nodes of implemented backends (except DEFAULT).
520 std::map<int, Ptr<BackendNode> > backendNodes;
521 // Flag to skip this layer's computation for a specific backend.
526 Ptr<Layer> getLayerInstance()
529 CV_TRACE_ARG_VALUE(type, "type", type.c_str());
532 return layerInstance;
534 layerInstance = LayerFactory::createLayerInstance(type, params);
537 CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
540 return layerInstance;
544 // Fake layer that holds the network input blobs.
545 struct DataLayer : public Layer
547 DataLayer() : Layer()
552 virtual bool supportBackend(int backendId) CV_OVERRIDE
554 return backendId == DNN_BACKEND_OPENCV ||
555 (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
558 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
561 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
563 CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
564 forward_ocl(inputs_arr, outputs_arr, internals_arr))
566 if (outputs_arr.depth() == CV_16S)
568 forward_fallback(inputs_arr, outputs_arr, internals_arr);
572 std::vector<Mat> outputs, internals;
573 outputs_arr.getMatVector(outputs);
574 internals_arr.getMatVector(internals);
577 // Supported conversions here: fp32 or uint8 input is converted to an fp32 output (see the checks below).
580 for (int i = 0; i < inputsData.size(); ++i)
582 double scale = scaleFactors[i];
583 Scalar& mean = means[i];
584 CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
585 CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");
587 bool singleMean = true;
588 for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
590 singleMean = mean[j] == mean[j - 1];
595 inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
599 for (int n = 0; n < inputsData[i].size[0]; ++n)
600 for (int c = 0; c < inputsData[i].size[1]; ++c)
602 Mat inp = getPlane(inputsData[i], n, c);
603 Mat out = getPlane(outputs[i], n, c);
604 inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
611 std::vector<Mat> tmp_expressions;
612 bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
615 // Supported conversions here: inputs are converted to fp32 outputs, or to fp16 when the output depth is CV_16S.
619 std::vector<UMat> outputs;
620 outputs_.getUMatVector(outputs);
622 tmp_expressions.clear();
623 for (int i = 0; i < inputsData.size(); ++i)
625 Mat inputData = inputsData[i];
627 double scale = scaleFactors[i];
628 Scalar& mean = means[i];
630 CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
631 bool singleMean = true;
632 for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
634 singleMean = mean[j] == mean[j - 1];
637 if (outputs_.depth() == CV_16S)
641 tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
642 convertFp16(tmp_expressions.back(), outputs[i]);
646 for (int n = 0; n < inputsData[i].size[0]; ++n)
647 for (int c = 0; c < inputsData[i].size[1]; ++c)
649 Mat inp = getPlane(inputsData[i], n, c);
651 std::vector<cv::Range> plane(4, Range::all());
652 plane[0] = Range(n, n + 1);
653 plane[1] = Range(c, c + 1);
654 UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
656 tmp_expressions.push_back(scale * (inp - mean[c]));
657 convertFp16(tmp_expressions.back(), out);
663 CV_Assert(outputs_.depth() == CV_32F);
666 inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
670 for (int n = 0; n < inputsData[i].size[0]; ++n)
671 for (int c = 0; c < inputsData[i].size[1]; ++c)
673 Mat inp = getPlane(inputsData[i], n, c);
675 std::vector<cv::Range> plane(4, Range::all());
676 plane[0] = Range(n, n + 1);
677 plane[1] = Range(c, c + 1);
678 UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
680 inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
689 int outputNameToIndex(const String& tgtName) CV_OVERRIDE
691 int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
692 return (idx < (int)outNames.size()) ? idx : -1;
695 void setNames(const std::vector<String> &names)
697 outNames.assign(names.begin(), names.end());
700 bool getMemoryShapes(const std::vector<MatShape> &inputs,
701 const int requiredOutputs,
702 std::vector<MatShape> &outputs,
703 std::vector<MatShape> &internals) const CV_OVERRIDE
705 CV_Assert(inputs.size() == requiredOutputs);
706 outputs.assign(inputs.begin(), inputs.end());
710 virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
712 std::vector<Mat> outputs;
713 outputs_arr.getMatVector(outputs);
715 CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
716 inputsData.size() == outputs.size());
718 for (int i = 0; skip && i < inputsData.size(); ++i)
720 if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
725 virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
727 #ifdef HAVE_INF_ENGINE
728 CV_CheckEQ(inputsData.size(), (size_t)1, "");
729 CV_CheckEQ(inputsData[0].dims, 4, "");
730 const size_t numChannels = inputsData[0].size[1];
731 CV_Assert(numChannels <= 4);
734 InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
735 InferenceEngine::Layout::C);
736 auto weights = InferenceEngine::make_shared_blob<float>(td);
739 float* weight_buf = weights->buffer().as<float*>();
740 std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);
743 auto biases = InferenceEngine::make_shared_blob<float>(td);
745 float* bias_buf = biases->buffer().as<float*>();
747 for (int i = 0; i < numChannels; ++i)
749 bias_buf[i] = -means[0][i] * scaleFactors[0];
752 InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
753 addConstantData("weights", weights, ieLayer);
754 addConstantData("biases", biases, ieLayer);
755 return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
756 #endif // HAVE_INF_ENGINE
757 return Ptr<BackendNode>();
760 std::vector<String> outNames;
761 // Preprocessing parameters for each network's input.
762 std::vector<double> scaleFactors;
763 std::vector<Scalar> means;
764 std::vector<Mat> inputsData;
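// These per-input parameters are filled by Net::setInput, e.g. (a sketch):
//
//     net.setInput(inputBlob, "data", 1.0 / 127.5, cv::Scalar(127.5, 127.5, 127.5));
//
// so that forward() above applies (input - mean) * scale when producing the output blob.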
771 // Increases the reference counter for a layer output.
772 void addReference(const LayerPin& lp)
774 std::map<LayerPin, int>::iterator it = refCounter.find(lp);
775 if (it == refCounter.end())
781 void addReferences(const std::vector<LayerPin>& pins)
783 for (int i = 0; i < pins.size(); i++)
785 addReference(pins[i]);
789 // Returns the number of references to the memory allocated for a specific blob.
791 int numReferences(const LayerPin& lp)
793 std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
794 CV_Assert(mapIt != reuseMap.end());
795 LayerPin memHost = mapIt->second;
797 std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
798 CV_Assert(refIt != refCounter.end());
799 return refIt->second;
802 // Reuses the data allocated for <host> inside the <user> blob.
803 void reuse(const LayerPin& host, const LayerPin& user)
805 CV_Assert(reuseMap.find(user) == reuseMap.end());
806 CV_Assert(reuseMap.find(host) != reuseMap.end());
807 LayerPin memHost = reuseMap[host];
808 reuseMap[user] = memHost;
809 if (refCounter.find(memHost) != refCounter.end())
811 std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
812 if (userRefIt != refCounter.end())
814 refCounter[memHost] += userRefIt->second;
815 refCounter.erase(userRefIt);
818 refCounter[memHost] += 1;
822 // Decreases the reference counter for the memory allocated for a specific blob.
823 void releaseReference(const LayerPin& lp)
825 std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
826 CV_Assert(mapIt != reuseMap.end());
828 std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
829 CV_Assert(refIt != refCounter.end());
830 CV_Assert(refIt->second > 0);
834 void releaseReferences(const std::vector<LayerPin>& pins)
836 for (int i = 0; i < pins.size(); i++)
838 releaseReference(pins[i]);
842 void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
844 if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
847 LayerPin bestBlobPin;
849 std::map<LayerPin, Mat>::iterator hostIt;
850 std::map<LayerPin, int>::iterator refIt;
852 const int targetTotal = total(shape);
853 int bestBlobTotal = INT_MAX;
855 for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
857 refIt = refCounter.find(hostIt->first);
858 // Use only blobs that have been referenced before; otherwise
859 // the blob might be used as a network output.
860 if (refIt != refCounter.end() && refIt->second == 0)
862 Mat& unusedBlob = hostIt->second;
863 if (unusedBlob.total() >= targetTotal &&
864 unusedBlob.total() < bestBlobTotal)
866 bestBlobPin = hostIt->first;
867 bestBlob = unusedBlob;
868 bestBlobTotal = unusedBlob.total();
872 if (!bestBlob.empty())
874 reuse(bestBlobPin, lp);
875 dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
881 // If dst has already been allocated with total(shape) elements,
882 // it is not recreated and the dst.data pointer stays the same.
883 dst.create(shape, use_half ? CV_16S : CV_32F);
888 void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
889 std::vector<LayerPin>& pinsForInternalBlobs,
890 bool use_half = false)
894 pinsForInternalBlobs.clear();
896 std::vector<Mat>& outputBlobs = ld.outputBlobs,
897 &internalBlobs = ld.internals;
899 const ShapesVec& outShapes = layerShapes.out,
900 internalShapes = layerShapes.internal;
902 outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
903 internalBlobs.resize(internalShapes.size());
905 CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
907 // Check whether the layer can work in-place.
908 bool inPlace = false;
909 if (layerShapes.supportInPlace)
911 if (ld.inputBlobs.size() == 1)
913 // Get number of references to the input memory.
914 int numRef = numReferences(ld.inputBlobsId[0]);
915 // In-place is possible only if the current layer is the sole consumer of this blob.
916 inPlace = numRef == 1;
920 ShapesVec shapes(outShapes);
921 shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
922 std::vector<Mat*> blobs;
923 for(int i = 0; i < outputBlobs.size(); i++)
925 blobs.push_back(&outputBlobs[i]);
928 for(int i = 0; i < internalBlobs.size(); i++)
930 blobs.push_back(&internalBlobs[i]);
931 if (total(internalShapes[i]))
933 pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
937 addReferences(pinsForInternalBlobs);
939 std::map<int, std::vector<int> > idxSizes;
940 for(int i = 0; i < shapes.size(); i++)
942 idxSizes[total(shapes[i])].push_back(i);
945 std::map<int, std::vector<int> >::reverse_iterator it;
946 for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
948 for(int j = 0; j < it->second.size(); j++)
950 int index = it->second[j];
951 if (total(shapes[index]))
953 LayerPin blobPin(ld.id, index);
954 if (index < outShapes.size() && inPlace)
956 CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
957 ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
958 reuse(ld.inputBlobsId[0], blobPin);
961 reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
967 // Clears the internal state. Called before every reallocation.
978 // Register allocated memory.
979 void addHost(const LayerPin& lp, const Mat& mat)
981 CV_Assert(memHosts.find(lp) == memHosts.end());
986 std::map<LayerPin, int> refCounter;
987 // Maps a pin to its origin blob (the one for which memory was allocated first).
988 // For origin blobs, key == value.
989 std::map<LayerPin, LayerPin> reuseMap;
990 std::map<LayerPin, Mat> memHosts;
993 static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
995 if (backendId == DNN_BACKEND_OPENCV)
997 if (targetId == DNN_TARGET_CPU)
998 return Ptr<BackendWrapper>();
999 else if (IS_DNN_OPENCL_TARGET(targetId))
1000 return OpenCLBackendWrapper::create(m);
1002 CV_Error(Error::StsNotImplemented, "Unknown target identifier");
1004 else if (backendId == DNN_BACKEND_HALIDE)
1006 CV_Assert(haveHalide());
1008 return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
1009 #endif // HAVE_HALIDE
1011 else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
1013 CV_Assert(haveInfEngine());
1014 #ifdef HAVE_INF_ENGINE
1015 return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
1016 #endif // HAVE_INF_ENGINE
1018 else if (backendId == DNN_BACKEND_VKCOM)
1020 CV_Assert(haveVulkan());
1022 return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
1023 #endif // HAVE_VULKAN
1025 else if (backendId == DNN_BACKEND_CUDA)
1027 CV_Assert(haveCUDA());
1032 case DNN_TARGET_CUDA:
1033 return CUDABackendWrapperFP32::create(m);
1034 case DNN_TARGET_CUDA_FP16:
1035 return CUDABackendWrapperFP16::create(m);
1037 CV_Assert(IS_DNN_CUDA_TARGET(targetId));
1042 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1043 return Ptr<BackendWrapper>();
1048 typedef std::map<int, LayerShapes> LayersShapesMap;
1049 typedef std::map<int, LayerData> MapIdToLayerData;
1053 // Allocate the fake network input layer.
1054 netInputLayer = Ptr<DataLayer>(new DataLayer());
1055 LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
1057 netInputLayer->name = inpl.name = "_input";
1058 inpl.type = "__NetInputLayer__";
1059 inpl.layerInstance = netInputLayer;
1060 layerNameToId.insert(std::make_pair(inpl.name, inpl.id));
1063 netWasAllocated = false;
1066 preferableBackend = DNN_BACKEND_DEFAULT;
1067 preferableTarget = DNN_TARGET_CPU;
1068 skipInfEngineInit = false;
1071 if (cv::cuda::getCudaEnabledDeviceCount() > 0)
1073 cuda4dnn::csl::CSLContext context;
1074 context.stream = cuda4dnn::csl::Stream(true);
1075 context.cublas_handle = cuda4dnn::csl::cublas::Handle(context.stream);
1076 context.cudnn_handle = cuda4dnn::csl::cudnn::Handle(context.stream);
1078 cudaInfo = std::unique_ptr<CudaInfo_t>(new CudaInfo_t(std::move(context)));
1083 Ptr<DataLayer> netInputLayer;
1084 std::vector<LayerPin> blobsToKeep;
1085 MapIdToLayerData layers;
1086 std::map<String, int> layerNameToId;
1087 BlobManager blobManager;
1088 int preferableBackend;
1089 int preferableTarget;
1090 String halideConfigFile;
1091 bool skipInfEngineInit;
1092 // Maps host data to a backend-specific wrapper.
1093 std::map<void*, Ptr<BackendWrapper> > backendWrappers;
1097 bool netWasAllocated;
1100 std::vector<int64> layersTimings;
1106 CudaInfo_t(cuda4dnn::csl::CSLContext ctxt) : context(std::move(ctxt)) { }
1107 cuda4dnn::csl::CSLContext context;
1108 cuda4dnn::csl::Workspace workspace;
1111 std::unique_ptr<CudaInfo_t> cudaInfo;
1114 Ptr<BackendWrapper> wrap(Mat& host)
1116 if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
1117 return Ptr<BackendWrapper>();
1119 MatShape shape(host.dims);
1120 for (int i = 0; i < host.dims; ++i)
1121 shape[i] = host.size[i];
1123 void* data = host.data;
1124 if (backendWrappers.find(data) != backendWrappers.end())
1126 Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
1127 if (preferableBackend == DNN_BACKEND_OPENCV)
1129 CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
1130 return OpenCLBackendWrapper::create(baseBuffer, host);
1132 else if (preferableBackend == DNN_BACKEND_HALIDE)
1134 CV_Assert(haveHalide());
1136 return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
1137 #endif // HAVE_HALIDE
1139 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1141 return wrapMat(preferableBackend, preferableTarget, host);
1143 else if (preferableBackend == DNN_BACKEND_VKCOM)
1146 return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
1149 else if (preferableBackend == DNN_BACKEND_CUDA)
1151 CV_Assert(haveCUDA());
1153 switch (preferableTarget)
1155 case DNN_TARGET_CUDA:
1156 return CUDABackendWrapperFP32::create(baseBuffer, shape);
1157 case DNN_TARGET_CUDA_FP16:
1158 return CUDABackendWrapperFP16::create(baseBuffer, shape);
1160 CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
1165 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1168 Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
1169 backendWrappers[data] = wrapper;
1174 void compileHalide()
1176 CV_TRACE_FUNCTION();
1178 CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);
1180 HalideScheduler scheduler(halideConfigFile);
1181 std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
1182 for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
1184 LayerData &ld = it->second;
1185 Ptr<Layer> layer = ld.layerInstance;
1186 if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
1188 CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
1189 bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
1192 // Use automatic scheduling provided by layer.
1193 layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
1194 ld.inputBlobs, ld.outputBlobs,
1197 compileList.emplace_back(ld);
1200 std::atomic<int> progress(0);
1201 auto fn = ([&] () -> void
1205 int id = progress.fetch_add(1);
1206 if ((size_t)id >= compileList.size())
1208 const LayerData& ld = compileList[id].get();
1209 Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
1210 dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
1213 size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
1214 num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
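// Clamp to between 1 and 8 compilation threads; the calling thread also processes tasks.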
1215 std::vector<std::thread> threads(num_threads - 1);
1216 for (auto& t: threads) t = std::thread(fn);
1217 fn(); // process own tasks
1218 for (auto& t: threads) t.join();
1224 CV_TRACE_FUNCTION();
1226 MapIdToLayerData::iterator it;
1227 for (it = layers.begin(); it != layers.end(); it++)
1229 if (it->second.id != 0) {
1230 it->second.inputBlobs.clear();
1231 it->second.outputBlobs.clear();
1232 it->second.internals.clear();
1234 it->second.skip = false;
1235 //it->second.consumers.clear();
1236 Ptr<Layer> currLayer = it->second.layerInstance;
1238 if( currLayer.empty() )
1241 currLayer->unsetAttached();
1244 layersTimings.clear();
1247 void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
1249 CV_TRACE_FUNCTION();
1251 if (preferableBackend == DNN_BACKEND_DEFAULT)
1252 preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;
1254 CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
1255 preferableTarget == DNN_TARGET_CPU ||
1256 preferableTarget == DNN_TARGET_OPENCL ||
1257 preferableTarget == DNN_TARGET_OPENCL_FP16);
1258 CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
1259 preferableTarget == DNN_TARGET_CPU ||
1260 preferableTarget == DNN_TARGET_OPENCL);
1261 CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
1262 preferableTarget == DNN_TARGET_CPU ||
1263 preferableTarget == DNN_TARGET_OPENCL ||
1264 preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1265 preferableTarget == DNN_TARGET_MYRIAD ||
1266 preferableTarget == DNN_TARGET_FPGA);
1267 CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
1268 preferableTarget == DNN_TARGET_VULKAN);
1269 CV_Assert(preferableBackend != DNN_BACKEND_CUDA ||
1270 IS_DNN_CUDA_TARGET(preferableTarget));
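// Caller-side example of one of the valid combinations checked above (a sketch):
//
//     net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);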
1272 if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
1274 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
1277 CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
1278 preferableTarget = DNN_TARGET_CPU;
1282 if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
1284 // Current implementation is only valid for GPU (#11494)
1285 if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
1287 CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
1288 preferableTarget = DNN_TARGET_CPU;
1290 else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
1292 CV_LOG_WARNING(NULL,
1293 "DNN: OpenCL target with fp16 precision is not supported "
1294 "with current OpenCL device (tested with Intel GPUs only), "
1295 "switching to OpenCL with fp32 precision.");
1296 preferableTarget = DNN_TARGET_OPENCL;
1301 if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
1303 preferableBackend = DNN_BACKEND_OPENCV;
1304 preferableTarget = DNN_TARGET_CPU;
1307 if (preferableBackend == DNN_BACKEND_CUDA && !haveCUDA())
1310 CV_LOG_WARNING(NULL, "unable to use CUDA backend; switching to CPU");
1312 CV_LOG_WARNING(NULL, "DNN module was not built with CUDA backend; switching to CPU");
1314 preferableBackend = DNN_BACKEND_OPENCV;
1315 preferableTarget = DNN_TARGET_CPU;
1320 allocateLayers(blobsToKeep_);
1322 MapIdToLayerData::iterator it = layers.find(0);
1323 CV_Assert(it != layers.end());
1324 it->second.skip = netInputLayer->skip;
1328 if (!netWasAllocated)
1331 if (preferableBackend == DNN_BACKEND_HALIDE)
1334 CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
1338 netWasAllocated = true;
1339 this->blobsToKeep = blobsToKeep_;
1343 int getLayerId(const String &layerName)
1345 std::map<String, int>::iterator it = layerNameToId.find(layerName);
1346 return (it != layerNameToId.end()) ? it->second : -1;
1349 int getLayerId(int id)
1351 MapIdToLayerData::iterator it = layers.find(id);
1352 return (it != layers.end()) ? id : -1;
1355 int getLayerId(DictValue &layerDesc)
1357 if (layerDesc.isInt())
1358 return getLayerId(layerDesc.get<int>());
1359 else if (layerDesc.isString())
1360 return getLayerId(layerDesc.get<String>());
1362 CV_Assert(layerDesc.isInt() || layerDesc.isString());
1366 String getLayerName(int id)
1368 MapIdToLayerData::iterator it = layers.find(id);
1369 return (it != layers.end()) ? it->second.name : "(unknown layer)";
1372 LayerData& getLayerData(int id)
1374 MapIdToLayerData::iterator it = layers.find(id);
1376 if (it == layers.end())
1377 CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));
1382 LayerData& getLayerData(const String &layerName)
1384 int id = getLayerId(layerName);
1387 CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");
1389 return getLayerData(id);
1392 LayerData& getLayerData(const DictValue &layerDesc)
1394 CV_Assert(layerDesc.isInt() || layerDesc.isString());
1395 if (layerDesc.isInt())
1396 return getLayerData(layerDesc.get<int>());
1397 else /*if (layerDesc.isString())*/
1398 return getLayerData(layerDesc.get<String>());
1401 static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
1403 if ((int)ld.inputBlobsId.size() <= inNum)
1405 ld.inputBlobsId.resize(inNum + 1);
1409 LayerPin storedFrom = ld.inputBlobsId[inNum];
1410 if (storedFrom.valid() && !storedFrom.equal(from))
1411 CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
1412 inNum, ld.name.c_str()));
1415 ld.inputBlobsId[inNum] = from;
1418 int resolvePinOutputName(LayerData &ld, const String &outName)
1420 if (outName.empty())
1422 return ld.getLayerInstance()->outputNameToIndex(outName);
1425 LayerPin getPinByAlias(const String &layerName)
1428 pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1431 pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);
1436 std::vector<LayerPin> getLayerOutPins(const String &layerName)
1438 int lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1440 std::vector<LayerPin> pins;
1442 for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
1444 pins.push_back(LayerPin(lid, i));
1450 void connect(int outLayerId, int outNum, int inLayerId, int inNum)
1452 CV_Assert(outLayerId < inLayerId);
1453 LayerData &ldOut = getLayerData(outLayerId);
1454 LayerData &ldInp = getLayerData(inLayerId);
1456 addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
1457 ldOut.requiredOutputs.insert(outNum);
1458 ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
1463 CV_TRACE_FUNCTION();
1464 if (preferableBackend == DNN_BACKEND_OPENCV)
1465 CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
1466 else if (preferableBackend == DNN_BACKEND_HALIDE)
1467 initHalideBackend();
1468 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1469 initInfEngineBackend();
1470 else if (preferableBackend == DNN_BACKEND_VKCOM)
1472 else if (preferableBackend == DNN_BACKEND_CUDA)
1475 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1478 void initHalideBackend()
1480 CV_TRACE_FUNCTION();
1481 CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());
1483 // Iterator to current layer.
1484 MapIdToLayerData::iterator it = layers.begin();
1485 // Iterator to the base layer for fusion. For example, for conv+bn+relu
1486 // the base will be the conv layer.
1487 MapIdToLayerData::iterator baseIt = layers.begin();
1488 for (; it != layers.end(); it++)
1490 LayerData &ldTop = it->second;
1491 Ptr<Layer> layerTop = ldTop.layerInstance;
1492 if (!layerTop->supportBackend(preferableBackend))
1494 // Move the base iterator to a layer that does not support the preferable
1495 // backend, to prevent fusion across layers of different backends.
1499 // Try to do layers fusion.
1500 LayerData &ldBot = baseIt->second;
1501 Ptr<Layer> layerBot = ldBot.layerInstance;
1502 // 1. Check that the bottom and top layers use the same backend.
1503 if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
1505 // 2. Check that current layer works in-place.
1506 bool inPlace = ldTop.inputBlobs.size() == 1 &&
1507 ldBot.outputBlobs.size() == 1 &&
1508 ldTop.inputBlobs[0]->data ==
1509 ldBot.outputBlobs[0].data;
1512 // 3. Try to attach node.
1513 CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
1514 Ptr<BackendNode> fusedNode =
1515 layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
1516 if (!fusedNode.empty())
1519 ldBot.backendNodes[preferableBackend] = fusedNode;
1520 ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
1525 // No layer fusion.
1527 ldTop.backendNodes[DNN_BACKEND_HALIDE] =
1528 layerTop->initHalide(ldTop.inputBlobsWrappers);
1533 #ifdef HAVE_INF_ENGINE
1534 // Before launching an Inference Engine graph we need to specify its output blobs.
1535 // This function requests output blobs based on the input references of
1536 // layers from the default backend or layers from different graphs.
1537 void addInfEngineNetOutputs(LayerData &ld)
1539 Ptr<InfEngineBackendNet> layerNet;
1540 if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
1542 Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1545 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1546 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1547 layerNet = ieNode->net;
1550 // For every input reference we check whether it belongs to one of
1551 // the Inference Engine backend graphs. If it does, request an output blob.
1552 // Do nothing if the layer's input comes from the same graph.
1553 for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1555 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1556 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1557 if (!inpNode.empty())
1559 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1560 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1561 if (layerNet != ieInpNode->net)
1563 // layerNet is empty or nodes are from different graphs.
1564 ieInpNode->net->addOutput(ieInpNode->layer.getName());
1569 #endif // HAVE_INF_ENGINE
1571 void initVkComBackend()
1573 CV_TRACE_FUNCTION();
1574 CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
1579 MapIdToLayerData::iterator it = layers.begin();
1580 for (; it != layers.end(); it++)
1582 LayerData &ld = it->second;
1583 Ptr<Layer> layer = ld.layerInstance;
1584 if (!layer->supportBackend(preferableBackend))
1593 ld.backendNodes[DNN_BACKEND_VKCOM] =
1594 layer->initVkCom(ld.inputBlobsWrappers);
1596 catch (const cv::Exception& e)
1598 CV_LOG_ERROR(NULL, "initVkCom failed, fallback to CPU implementation. " << e.what());
1599 ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
1605 void initInfEngineBackend()
1607 CV_TRACE_FUNCTION();
1608 CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1609 #ifdef HAVE_INF_ENGINE
1610 MapIdToLayerData::iterator it;
1611 Ptr<InfEngineBackendNet> net;
1613 for (it = layers.begin(); it != layers.end(); ++it)
1615 LayerData &ld = it->second;
1618 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1619 (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1620 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1622 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1623 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1624 dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1626 dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
1632 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1634 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1635 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1636 dataPtr->name = ld.name;
1638 dataPtr->setName(ld.name);
1644 if (skipInfEngineInit)
1646 Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1647 CV_Assert(!node.empty());
1649 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1650 CV_Assert(!ieNode.empty());
1652 for (it = layers.begin(); it != layers.end(); ++it)
1654 LayerData &ld = it->second;
1657 for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1659 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1660 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1661 dataPtr->name = netInputLayer->outNames[i];
1663 dataPtr->setName(netInputLayer->outNames[i]);
1669 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1671 InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1672 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1673 dataPtr->name = ld.name;
1675 dataPtr->setName(ld.name);
1679 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1680 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1683 layers[lastLayerId].skip = false;
1684 ieNode->net->init(preferableTarget);
1688 // Build Inference Engine networks from the sets of layers that support this
1689 // backend. Split the whole model into several Inference Engine networks if
1690 // some of the layers are not implemented.
1692 // Set of all input and output blobs wrappers for current network.
1693 std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1694 for (it = layers.begin(); it != layers.end(); ++it)
1696 LayerData &ld = it->second;
1697 if (ld.id == 0 && ld.skip)
1699 bool fused = ld.skip;
1701 Ptr<Layer> layer = ld.layerInstance;
1702 if (!fused && !layer->supportBackend(preferableBackend))
1704 bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 &&
1705 INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2);
1706 // TODO: there is a bug in the Myriad plugin related to custom layer shape inference.
1707 if (preferableTarget == DNN_TARGET_MYRIAD)
1709 for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
1711 customizable = ld.inputBlobs[i]->size[0] == 1;
1715 // TODO: fix these workarounds
1716 if (preferableTarget == DNN_TARGET_MYRIAD ||
1717 preferableTarget == DNN_TARGET_OPENCL ||
1718 preferableTarget == DNN_TARGET_OPENCL_FP16)
1719 customizable &= ld.type != "Concat";
1721 if (preferableTarget == DNN_TARGET_OPENCL ||
1722 preferableTarget == DNN_TARGET_OPENCL_FP16)
1723 customizable &= ld.type != "Power";
1725 if (preferableTarget == DNN_TARGET_OPENCL)
1726 customizable &= ld.type != "Eltwise";
1730 addInfEngineNetOutputs(ld);
1731 net = Ptr<InfEngineBackendNet>();
1732 netBlobsWrappers.clear(); // Not used for the R5 release, but we don't wrap it in an #ifdef.
1733 layer->preferableTarget = DNN_TARGET_CPU;
1737 ld.skip = true; // Initially skip all Inference Engine supported layers.
1739 // Create a new network if one of the inputs comes from a different Inference Engine graph.
1740 for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1742 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1743 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1744 if (!inpNode.empty())
1746 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1747 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1748 if (ieInpNode->net != net)
1750 net = Ptr<InfEngineBackendNet>();
1751 netBlobsWrappers.clear(); // Not used for the R5 release, but we don't wrap it in an #ifdef.
1757 Ptr<BackendNode> node;
1762 bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1763 ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1765 node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1766 ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1770 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1774 if (layer->supportBackend(preferableBackend))
1775 node = layer->initInfEngine(ld.inputBlobsWrappers);
1778 node = Ptr<BackendNode>(new InfEngineBackendNode(
1779 ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
1782 else if (node.empty())
1785 CV_Assert(!node.empty());
1786 ld.backendNodes[preferableBackend] = node;
1788 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1789 CV_Assert(!ieNode.empty());
1792 // Convert weights to FP16 for specific targets.
1793 if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1794 preferableTarget == DNN_TARGET_MYRIAD ||
1795 preferableTarget == DNN_TARGET_FPGA) && !fused)
1797 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
1798 for (const std::string& name : {"weights", "biases"})
1800 auto it = ieNode->layer.getParameters().find(name);
1801 if (it != ieNode->layer.getParameters().end())
1803 InferenceEngine::Blob::Ptr bp = it->second.as<InferenceEngine::Blob::Ptr>();
1804 it->second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(bp));
1808 auto& blobs = ieNode->layer.getConstantData();
1811 // In the case of a non-weightable layer we have to specify
1812 // its precision by adding a dummy blob.
1813 auto blob = InferenceEngine::make_shared_blob<int16_t>(
1814 InferenceEngine::Precision::FP16,
1815 InferenceEngine::Layout::C, {1});
1821 for (auto& it : blobs)
1822 it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
1828 net->addLayer(ieNode->layer);
1830 net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
1831 net->addBlobs(ld.inputBlobsWrappers);
1832 net->addBlobs(ld.outputBlobsWrappers);
1833 addInfEngineNetOutputs(ld);
1836 // Initialize all networks.
1837 for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1839 LayerData &ld = it->second;
1840 if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1843 Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1847 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1851 CV_Assert(!ieNode->net.empty());
1853 if (!ieNode->net->isInitialized())
1855 ieNode->net->init(preferableTarget);
1859 #endif // HAVE_INF_ENGINE
1862 void initCUDABackend() {
1863 CV_Assert(haveCUDA());
1866 for (auto& layer : layers)
1868 auto& ld = layer.second;
1869 auto& layerInstance = ld.layerInstance;
1871 if (!layerInstance->supportBackend(DNN_BACKEND_CUDA))
1873 std::ostringstream os;
1874 os << "CUDA backend will fallback to the CPU implementation for the layer \"" << ld.name
1875 << "\" of type " << ld.type << '\n';
1876 CV_LOG_INFO(NULL, os.str().c_str());
1880 /* we make a copy so that `initCUDA` doesn't modify `cudaInfo->context` */
1881 auto context = cudaInfo->context;
1882 auto node = layerInstance->initCUDA(&context, ld.inputBlobsWrappers, ld.outputBlobsWrappers);
1883 ld.backendNodes[DNN_BACKEND_CUDA] = node;
1885 auto cudaNode = node.dynamicCast<CUDABackendNode>();
1886 cudaInfo->workspace.require(cudaNode->get_workspace_memory_in_bytes());
1891 void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1893 CV_TRACE_FUNCTION();
1895 LayerData &ld = layers[lid];
1901 size_t ninputs = ld.inputBlobsId.size();
1903 printf("layer %s:", ld.name.c_str());
1904 for (size_t i = 0; i < ninputs; i++)
1906 int inp_lid = ld.inputBlobsId[i].lid;
1907 LayerData &inp_ld = layers[inp_lid];
1908 int inp_outputs = (int)inp_ld.outputBlobs.size();
1909 std::cout << " " << inp_ld.name << "(" << inp_outputs;
1911 for( int j = 0; j < inp_outputs; j++ )
1913 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1920 // Determine the parent layers.
1921 for (size_t i = 0; i < ninputs; i++)
1922 ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1925 for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1926 allocateLayer(*i, layersShapes);
1929 if (ld.id == 0) // DataLayer
1931 ninputs = netInputLayer->inputsData.size();
1932 ld.inputBlobsWrappers.resize(ninputs);
1933 for (size_t i = 0; i < ninputs; i++)
1935 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1937 if (IS_DNN_CUDA_TARGET(preferableTarget))
1939 auto wrapper = ld.inputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
1940 wrapper->setStream(cudaInfo->context.stream);
1947 ld.inputBlobs.resize(ninputs);
1948 ld.inputBlobsWrappers.resize(ninputs);
1949 for (size_t i = 0; i < ninputs; i++)
1951 LayerPin from = ld.inputBlobsId[i];
1952 CV_Assert(from.valid());
1953 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1954 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1955 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1959 LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1961 CV_Assert(layerShapesIt != layersShapes.end());
1963 std::vector<LayerPin> pinsForInternalBlobs;
1964 blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1965 preferableBackend == DNN_BACKEND_OPENCV &&
1966 preferableTarget == DNN_TARGET_OPENCL_FP16);
1967 ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1968 for (int i = 0; i < ld.outputBlobs.size(); ++i)
1970 ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1972 if (IS_DNN_CUDA_TARGET(preferableTarget))
1974 auto wrapper = ld.outputBlobsWrappers[i].dynamicCast<CUDABackendWrapper>();
1975 wrapper->setStream(cudaInfo->context.stream);
1980 /* CUDA backend has its own system for internal blobs; we don't need these */
1981 ld.internalBlobsWrappers.resize((preferableBackend == DNN_BACKEND_CUDA) ? 0 : ld.internals.size());
1982 for (int i = 0; i < ld.internalBlobsWrappers.size(); ++i)
1984 ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1987 Ptr<Layer> layerPtr = ld.getLayerInstance();
1989 std::vector<Mat> inps(ld.inputBlobs.size());
1990 for (int i = 0; i < ld.inputBlobs.size(); ++i)
1992 inps[i] = *ld.inputBlobs[i];
1994 layerPtr->finalize(inps, ld.outputBlobs);
1995 layerPtr->preferableTarget = preferableTarget;
1997 std::cout << "\toutputs:";
1998 size_t noutputs = ld.outputBlobs.size();
1999 for (size_t j = 0; j < noutputs; j++)
2001 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
2007 // After allocating the layer, decrease the reference counters of its input blobs.
2008 blobManager.releaseReferences(ld.inputBlobsId);
2009 blobManager.releaseReferences(pinsForInternalBlobs);
2015 #define printf_(args) printf args
2017 #define printf_(args)
2020 void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
2022 if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
2023 preferableBackend != DNN_BACKEND_CUDA &&
2024 preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
2027 CV_TRACE_FUNCTION();
2029 // Scan through all the layers. If a convolution layer is followed by an activation layer,
2030 // we try to embed the activation into the convolution and disable separate execution of the activation.
2031 std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
2032 blobsToKeep_.end());
2033 MapIdToLayerData::iterator it;
2034 for (it = layers.begin(); it != layers.end(); it++)
2036 int lid = it->first;
2037 LayerData& ld = layers[lid];
2040 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2043 printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
2045 // Optimization #1: try to fuse batch norm, scaling and/or activation layers
2046 // with the current layer if they follow it. Normally they are fused with the convolution layer,
2047 // but some of them (like activation) may be fused with fully-connected, elementwise (+) and
2048 // some other layers.
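// e.g. a Convolution directly followed by ReLU ends up executed as a single fused
// Convolution node, and the ReLU layer is simply marked with skip = true below.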
2049 Ptr<Layer>& currLayer = ld.layerInstance;
2050 if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
2052 LayerData* nextData = &layers[ld.consumers[0].lid];
2053 LayerPin lpNext(ld.consumers[0].lid, 0);
2056 Ptr<Layer> nextLayer = nextData->layerInstance;
2057 if (currLayer->tryFuse(nextLayer))
2059 printf_(("\tfused with %s\n", nextLayer->name.c_str()));
2060 nextData->skip = true;
2061 ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2062 ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2063 if (nextData->consumers.size() == 1)
2065 int nextLayerId = nextData->consumers[0].lid;
2066 nextData = &layers[nextLayerId];
2067 lpNext = LayerPin(nextLayerId, 0);
2079 if (preferableBackend != DNN_BACKEND_OPENCV)
2080 continue; // Go to the next layer.
2082 // TODO: support more fusion styles for the OpenCL target.
2083 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
2084 (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
2085 ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
2086 ld.layerInstance->type != "Concat")) )
2091 // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/Power/Tanh activations.
2092 if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
2093 nextData->type != "ReLU" &&
2094 nextData->type != "ChannelsPReLU" &&
2095 nextData->type != "ReLU6" &&
2096 nextData->type != "TanH" &&
2097 nextData->type != "Power")
2100 Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2101 if (nextActivLayer.empty())
2104 if (currLayer->setActivation(nextActivLayer))
2106 printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2107 nextData->skip = true;
2108 ld.outputBlobs = layers[lpNext.lid].outputBlobs;
2109 ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
2110 if (nextData->consumers.size() == 1)
2112 int nextLayerId = nextData->consumers[0].lid;
2113 nextData = &layers[nextLayerId];
2114 lpNext = LayerPin(nextLayerId, 0);
2126 // Fuse a convolution layer followed by eltwise + relu.
2127 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
2129 Ptr<EltwiseLayer> nextEltwiseLayer;
2131 nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
2133 if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2134 nextData && nextData->inputBlobsId.size() == 2 )
2136 LayerData *eltwiseData = nextData;
2138 // The eltwise layer has two inputs. We need to determine which
2139 // is the base convolution layer and which could be used as its bias.
2140 LayerData* biasLayerData = 0;
2141 for (int i = 0; i < 2; ++i)
2143 LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
2144 CV_Assert(downLayerData);
2145 while (downLayerData->skip)
2147 if (downLayerData->inputBlobsId.size() == 1)
2148 downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
2155 if (downLayerData && ld.id == downLayerData->id)
2157 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
2161 CV_Assert(biasLayerData);
2163 if( eltwiseData->consumers.size() == 1 )
2165 // fuse eltwise + activation layer
2166 if (biasLayerData->id < ld.id)
2168 nextData = &layers[eltwiseData->consumers[0].lid];
2169 lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
2170 Ptr<ActivationLayer> nextActivLayer;
2172 nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2174 if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2175 (!nextData->type.compare("ReLU") ||
2176 !nextData->type.compare("ChannelsPReLU") ||
2177 !nextData->type.compare("Power")) &&
2178 currLayer->setActivation(nextActivLayer) )
2180 CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2181 ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2182 printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2183 printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2184 eltwiseData->skip = true;
2185 nextData->skip = true;
2186 // This optimization is for cases like
2187 // some_layer   conv
2188 //   |             |
2189 //   +-- eltwise --+
2190 //          |
2191 //        activ
2192 // This way all the element-wise computations
2193 // (i.e. some_layer+conv or some_layer*conv)
2194 // would be done at [conv] layer. So we need to
2195 // replace [conv]'s output blob to [eltwise]'s one
2196 // considering that [activ] is an in-place layer.
2197 // Also we need to move all the consumers' references.
2198 // To prevent memory collisions (i.e. when input of
2199 // [conv] and output of [eltwise] is the same blob)
2200 // we allocate a new blob.
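// A concrete instance (illustrative): a ResNet-style residual block, where the
// shortcut branch and [conv] are summed by [eltwise] and passed through ReLU;
// after fusion both the sum and the activation run inside the convolution call.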
2201 CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2202 ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2203 ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2205 eltwiseData->outputBlobs = ld.outputBlobs;
2206 nextData->outputBlobs = ld.outputBlobs;
2207 eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2208 nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
2210 // Move references of [activ] layer consumers to the newly allocated blob.
2211 for (int i = 0; i < nextData->consumers.size(); ++i)
2213 LayerData& consumer = layers[nextData->consumers[i].lid];
2214 for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2216 if (consumer.inputBlobsId[j].lid == lpNext.lid)
2218 consumer.inputBlobs[j] = &ld.outputBlobs[0];
2219 consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2232 if (preferableBackend != DNN_BACKEND_OPENCV)
2233 continue; // Go to the next layer.
2235 // the optimization #2. if there is a concat layer that concatenates channels
2236 // from the inputs together (i.e. axis == 1) then we make the inputs of
2237 // the concat layer write directly into the concatenation output buffer
2238 // (and so we eliminate the concatenation layer, because the channels
2239 // are concatenated implicitly).
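// Illustrative effect (layer names are assumptions): for an Inception-style
// block where branch1x1 / branch3x3 / branch5x5 feed a Concat with axis == 1,
// each branch is redirected to write into its own channel slice of the Concat
// output blob, and the Concat layer itself is skipped at runtime.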
2240 Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
2241 if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
2242 ld.outputBlobs.size() == 1 )
2244 Mat& output = ld.outputBlobs[0];
2246 if (!ld.outputBlobsWrappers.empty() &&
2247 (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
2249 size_t i, ninputs = ld.inputBlobsId.size();
2250 bool conv_layer = true;
2251 for( i = 0; i < ninputs; i++ )
2253 LayerPin pin = ld.inputBlobsId[i];
2254 LayerData* inp_i_data = &layers[pin.lid];
2255 while(inp_i_data->skip &&
2256 inp_i_data->inputBlobsId.size() == 1 &&
2257 inp_i_data->consumers.size() == 1)
2259 pin = inp_i_data->inputBlobsId[0];
2260 inp_i_data = &layers[pin.lid];
2262 conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
2266 std::vector<UMat> umat_outputBlobs;
2267 umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2268 umat_output = umat_outputBlobs[0];
2271 // TODO: in general, this optimization can always be done, but
2272 // many layers currently check that the input/output blobs are
2273 // continuous arrays. Unfortunately, this is not true when
2274 // the concatenation optimization is applied with batch_size > 1.
2275 // So, for now, we only apply this optimization in the most popular
2276 // case, batch_size == 1.
2277 if( output.dims == 4 && output.size[0] == 1 )
2279 size_t i, ninputs = ld.inputBlobsId.size();
2280 std::vector<LayerPin> realinputs(ninputs);
2281 for( i = 0; i < ninputs; i++ )
2283 LayerPin pin = ld.inputBlobsId[i];
2284 LayerData* inp_i_data = &layers[pin.lid];
2285 while(inp_i_data->skip &&
2286 inp_i_data->inputBlobsId.size() == 1 &&
2287 inp_i_data->consumers.size() == 1)
2289 pin = inp_i_data->inputBlobsId[0];
2290 inp_i_data = &layers[pin.lid];
2292 printf_(("\treal input for %s is %s\n",
2293 layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
2294 inp_i_data->getLayerInstance()->name.c_str()));
2296 if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
2298 realinputs[i] = pin;
2303 // Allocate new memory to prevent collisions during memory
2304 // reuse (see https://github.com/opencv/opencv/pull/10456).
2305 output = output.clone();
2306 if (preferableBackend == DNN_BACKEND_OPENCV &&
2307 IS_DNN_OPENCL_TARGET(preferableTarget))
2309 std::vector<UMat> umats(1);
2310 umat_output = umat_output.clone();
2311 umats[0] = umat_output;
2312 OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2314 Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2316 for( i = 0; i < ninputs; i++ )
2318 LayerPin pin = realinputs[i];
2319 LayerData* inp_i_data = &layers[pin.lid];
2320 int channels_i = ld.inputBlobs[i]->size[1];
2321 chrange[1] = Range(ofs, ofs + channels_i);
2322 printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2323 pin.oid, ofs, ofs + channels_i));
2325 Mat output_slice = output(chrange);
2326 Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2327 CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2328 Mat* oldPtr = &curr_output;
2329 curr_output = output_slice;
2330 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2332 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2333 umats[pin.oid] = umat_output(chrange);
2334 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2336 // Layers that referred to the old input Mat will now see the
2337 // new data through the same Mat object.
2338 CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2341 printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2348 void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2350 CV_TRACE_FUNCTION();
2352 MapIdToLayerData::iterator it;
2353 for (it = layers.begin(); it != layers.end(); it++)
2354 it->second.flag = 0;
2356 CV_Assert(!layers[0].outputBlobs.empty());
2357 ShapesVec inputShapes;
2358 for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2360 Mat& inp = layers[0].outputBlobs[i];
2361 CV_Assert(inp.total());
2362 if (preferableBackend == DNN_BACKEND_OPENCV &&
2363 preferableTarget == DNN_TARGET_OPENCL_FP16)
2365 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
2367 inputShapes.push_back(shape(inp));
2369 LayersShapesMap layersShapes;
2370 getLayersShapes(inputShapes, layersShapes);
2372 blobManager.reset();
2373 backendWrappers.clear();
2375 for(auto& layer : layers)
2377 auto& ld = layer.second;
2378 ld.inputBlobsWrappers.clear();
2379 ld.outputBlobsWrappers.clear();
2380 ld.internalBlobsWrappers.clear();
2383 // Fake references to input blobs.
2384 for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2385 blobManager.addReference(LayerPin(0, i));
2386 for (it = layers.begin(); it != layers.end(); ++it)
2388 const LayerData& ld = it->second;
2389 blobManager.addReferences(ld.inputBlobsId);
2392 for (int i = 0; i < blobsToKeep_.size(); i++)
2394 blobManager.addReference(blobsToKeep_[i]);
2397 for (it = layers.begin(); it != layers.end(); it++)
2399 int lid = it->first;
2400 allocateLayer(lid, layersShapes);
2403 layersTimings.resize(lastLayerId + 1, 0);
2404 fuseLayers(blobsToKeep_);
2407 void forwardLayer(LayerData &ld)
2409 CV_TRACE_FUNCTION();
2411 Ptr<Layer> layer = ld.layerInstance;
2413 TickMeter tm;
2414 tm.start();
2416 if( !ld.skip )
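// Dispatch: if no backend-specific node was created for this layer during
// setup (or the OpenCV backend is selected), fall back to the layer's own
// forward() on CPU/OpenCL; otherwise hand execution to the CUDA, Halide,
// Inference Engine or Vulkan node built at initialization time.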
2418 std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2419 if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2421 if (isAsync)
2422 CV_Error(Error::StsNotImplemented, "Default implementation fallback is not supported in asynchronous mode");
2424 if (!layer->supportBackend(DNN_BACKEND_OPENCV))
2425 CV_Error(Error::StsNotImplemented, format("Layer \"%s\" of type \"%s\" unsupported on OpenCV backend",
2426 ld.name.c_str(), ld.type.c_str()));
2428 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2430 std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2431 std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2432 std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2433 layer->forward(umat_inputBlobs,
2435 umat_internalBlobs);
2436 if (DNN_CHECK_NAN_INF)
2439 for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2441 UMat& u = umat_outputBlobs[i];
2443 if (u.depth() == CV_16S) // FP16
2446 m = u.getMat(ACCESS_READ);
2449 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2450 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2453 else if (!checkRange(m, true, NULL, -1e6, 1e6))
2455 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2456 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2462 for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2464 UMat& u = umat_inputBlobs[i];
2466 if (u.depth() == CV_16S) // FP16
2469 m = u.getMat(ACCESS_READ);
2470 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2471 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2473 for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2475 UMat& u = umat_outputBlobs[i];
2477 if (u.depth() == CV_16S) // FP16
2480 m = u.getMat(ACCESS_READ);
2481 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2482 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2484 for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2486 UMat& u = umat_internalBlobs[i];
2488 if (u.depth() == CV_16S) // FP16
2491 m = u.getMat(ACCESS_READ);
2492 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2493 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2495 if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2499 OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2503 for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2505 if (!ld.inputBlobsWrappers[i].empty())
2506 ld.inputBlobsWrappers[i]->copyToHost();
2509 std::vector<Mat> inps(ld.inputBlobs.size());
2510 for (int i = 0; i < ld.inputBlobs.size(); ++i)
2512 inps[i] = *ld.inputBlobs[i];
2514 layer->forward(inps, ld.outputBlobs, ld.internals);
2516 if (DNN_CHECK_NAN_INF)
2519 for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2521 const Mat& m = ld.outputBlobs[i];
2524 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2525 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2528 else if (!checkRange(m, true, NULL, -1e6, 1e6))
2530 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2531 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2537 for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2539 const Mat* pM = ld.inputBlobs[i];
2542 std::cout << "INPUT " << i << " is NULL" << std::endl;
2546 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2547 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2549 for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2551 const Mat& m = ld.outputBlobs[i];
2552 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2553 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2555 for (size_t i = 0; i < ld.internals.size(); ++i)
2557 const Mat& m = ld.internals[i];
2558 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2559 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2561 if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2566 for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2568 if (!ld.outputBlobsWrappers[i].empty())
2569 ld.outputBlobsWrappers[i]->setHostDirty();
2575 Ptr<BackendNode> node = it->second;
2576 CV_Assert(!node.empty());
2577 if (preferableBackend == DNN_BACKEND_CUDA)
2579 CV_Assert(haveCUDA());
2582 Ptr<CUDABackendNode> cudaNode = node.dynamicCast<CUDABackendNode>();
2583 CV_Assert(!cudaNode.empty());
2585 cudaNode->forward(ld.inputBlobsWrappers, ld.outputBlobsWrappers, cudaInfo->workspace);
2588 else if (preferableBackend == DNN_BACKEND_HALIDE)
2590 forwardHalide(ld.outputBlobsWrappers, node);
2592 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2594 forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
2596 else if (preferableBackend == DNN_BACKEND_VKCOM)
2600 forwardVkCom(ld.outputBlobsWrappers, node);
2602 catch (const cv::Exception& e)
2604 CV_LOG_ERROR(NULL, "forwardVkCom failed, fallback to CPU implementation. " << e.what());
2605 it->second = Ptr<BackendNode>();
2611 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2619 layersTimings[ld.id] = tm.getTimeTicks();
2624 void forwardToLayer(LayerData &ld, bool clearFlags = true)
2626 CV_TRACE_FUNCTION();
2630 MapIdToLayerData::iterator it;
2631 for (it = layers.begin(); it != layers.end(); it++)
2632 it->second.flag = 0;
2635 // already forwarded
2636 if (ld.flag)
2637 return;
2639 // forward parents
2640 MapIdToLayerData::iterator it;
2641 for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2643 LayerData &ld = it->second;
2653 if (preferableBackend == DNN_BACKEND_CUDA)
2654 cudaInfo->context.stream.synchronize();
2658 void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2660 std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2662 if (id == 0 && inOutShapes[id].in[0].empty())
2664 if (!layers[0].outputBlobs.empty())
2667 for (int i = 0; i < layers[0].outputBlobs.size(); i++)
2669 Mat& inp = layers[0].outputBlobs[i];
2670 CV_Assert(inp.total());
2671 shapes.push_back(shape(inp));
2673 inOutShapes[0].in = shapes;
2677 inOutShapes[0].out.clear();
2682 if (inOutShapes[id].in.empty())
2684 for(int i = 0; i < inputLayerIds.size(); i++)
2686 int layerId = inputLayerIds[i].lid;
2687 LayersShapesMap::iterator it =
2688 inOutShapes.find(layerId);
2689 if(it == inOutShapes.end() ||
2690 it->second.out.empty())
2692 getLayerShapesRecursively(layerId, inOutShapes);
2694 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2695 inOutShapes[id].in.push_back(shape);
2698 const ShapesVec& is = inOutShapes[id].in;
2699 ShapesVec& os = inOutShapes[id].out;
2700 ShapesVec& ints = inOutShapes[id].internal;
2701 int requiredOutputs = layers[id].requiredOutputs.size();
2702 inOutShapes[id].supportInPlace =
2703 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2705 for (int i = 0; i < ints.size(); i++)
2706 CV_Assert(total(ints[i]) > 0);
2708 for (int i = 0; i < os.size(); i++)
2709 CV_Assert(total(os[i]) > 0);
2712 void getLayersShapes(const ShapesVec& netInputShapes,
2713 LayersShapesMap& inOutShapes)
2715 inOutShapes.clear();
2717 inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2718 for (MapIdToLayerData::iterator it = layers.begin();
2719 it != layers.end(); it++)
2721 getLayerShapesRecursively(it->first, inOutShapes);
2725 void getLayerShapes(const ShapesVec& netInputShapes,
2727 LayerShapes& shapes)
2729 LayersShapesMap inOutShapes;
2730 inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2731 getLayerShapesRecursively(layerId, inOutShapes);
2732 shapes = inOutShapes[layerId];
2735 LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2737 return *std::max_element(pins.begin(), pins.end());
2740 Mat getBlob(const LayerPin& pin)
2742 CV_TRACE_FUNCTION();
2745 CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2747 LayerData &ld = layers[pin.lid];
2748 if ((size_t)pin.oid >= ld.outputBlobs.size())
2750 CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %zu outputs, "
2751 "the #%d was requested", ld.name.c_str(),
2752 ld.outputBlobs.size(), pin.oid));
2754 if (preferableTarget != DNN_TARGET_CPU)
2756 CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2757 // Transfer data to CPU if required.
2758 ld.outputBlobsWrappers[pin.oid]->copyToHost();
2761 if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2763 convertFp16(ld.outputBlobs[pin.oid], output_blob);
2767 return ld.outputBlobs[pin.oid];
2770 Mat getBlob(String outputName)
2772 return getBlob(getPinByAlias(outputName));
2776 AsyncArray getBlobAsync(const LayerPin& pin)
2778 CV_TRACE_FUNCTION();
2779 #ifdef HAVE_INF_ENGINE
2781 CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2783 LayerData &ld = layers[pin.lid];
2784 if ((size_t)pin.oid >= ld.outputBlobs.size())
2786 CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2787 "the #%d was requested", ld.name.c_str(),
2788 (int)ld.outputBlobs.size(), (int)pin.oid));
2790 if (preferableTarget != DNN_TARGET_CPU)
2792 CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2793 // Transfer data to CPU if required.
2794 ld.outputBlobsWrappers[pin.oid]->copyToHost();
2796 CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
2798 Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
2799 return std::move(wrapper->futureMat);
2801 CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
2805 AsyncArray getBlobAsync(String outputName)
2807 return getBlobAsync(getPinByAlias(outputName));
2812 Net::Net() : impl(new Net::Impl)
2816 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2818 #ifndef HAVE_INF_ENGINE
2819 CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2821 InferenceEngine::CNNNetReader reader;
2822 reader.ReadNetwork(xml);
2823 reader.ReadWeights(bin);
2825 InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2827 std::vector<String> inputsNames;
2828 std::vector<MatShape> inp_shapes;
2829 for (auto& it : ieNet.getInputsInfo())
2831 inputsNames.push_back(it.first);
2832 std::vector<size_t> dims = it.second->getTensorDesc().getDims();
2833 inp_shapes.push_back(std::vector<int>(dims.begin(), dims.end()));
2837 cvNet.setInputsNames(inputsNames);
2839 // set empty input to determine input shapes
2840 for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
2842 cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
2845 Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
2846 backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2847 for (auto& it : ieNet.getOutputsInfo())
2849 Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
2850 InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2854 int lid = cvNet.addLayer(it.first, "", lp);
2856 LayerData& ld = cvNet.impl->layers[lid];
2857 cvLayer->name = it.first;
2858 cvLayer->type = ieLayer->type;
2859 ld.layerInstance = cvLayer;
2860 ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2862 for (int i = 0; i < inputsNames.size(); ++i)
2863 cvNet.connect(0, i, lid, i);
2865 cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2867 cvNet.impl->skipInfEngineInit = true;
2869 #endif // HAVE_INF_ENGINE
2876 int Net::addLayer(const String &name, const String &type, LayerParams ¶ms)
2878 CV_TRACE_FUNCTION();
2880 if (impl->getLayerId(name) >= 0)
2882 CV_Error(Error::StsBadArg, "Layer \"" + name + "\" is already in the net");
2886 int id = ++impl->lastLayerId;
2887 impl->layerNameToId.insert(std::make_pair(name, id));
2888 impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2893 int Net::addLayerToPrev(const String &name, const String &type, LayerParams ¶ms)
2895 CV_TRACE_FUNCTION();
2897 int prvLid = impl->lastLayerId;
2898 int newLid = this->addLayer(name, type, params);
2899 this->connect(prvLid, 0, newLid, 0);
2903 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2905 CV_TRACE_FUNCTION();
2907 impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2910 void Net::connect(String _outPin, String _inPin)
2912 CV_TRACE_FUNCTION();
2914 LayerPin outPin = impl->getPinByAlias(_outPin);
2915 LayerPin inpPin = impl->getPinByAlias(_inPin);
2917 CV_Assert(outPin.valid() && inpPin.valid());
2919 impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2922 Mat Net::forward(const String& outputName)
2924 CV_TRACE_FUNCTION();
2926 String layerName = outputName;
2928 if (layerName.empty())
2929 layerName = getLayerNames().back();
2931 std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2932 impl->setUpNet(pins);
2933 impl->forwardToLayer(impl->getLayerData(layerName));
2935 return impl->getBlob(layerName);
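// Usage sketch (illustrative only; file names, layer name and input size are
// assumptions, not part of this module): a typical synchronous pass looks like
//
//     Net net = readNet("model.caffemodel", "deploy.prototxt");
//     net.setInput(blobFromImage(img, 1.0, Size(224, 224)));
//     Mat prob = net.forward();        // up to the last registered layer
//     Mat out  = net.forward("prob");  // or a specific named output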
2938 AsyncArray Net::forwardAsync(const String& outputName)
2940 CV_TRACE_FUNCTION();
2942 String layerName = outputName;
2944 if (layerName.empty())
2945 layerName = getLayerNames().back();
2947 std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2948 impl->setUpNet(pins);
2950 if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
2951 CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only for the DNN_BACKEND_INFERENCE_ENGINE backend");
2953 impl->isAsync = true;
2954 impl->forwardToLayer(impl->getLayerData(layerName));
2955 impl->isAsync = false;
2957 return impl->getBlobAsync(layerName);
2959 CV_Error(Error::StsNotImplemented, "Asynchronous forward requires a C++11 build");
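// Usage sketch (illustrative; requires a build with the Inference Engine
// backend): forwardAsync() returns an AsyncArray that is resolved later:
//
//     net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
//     AsyncArray async_out = net.forwardAsync();
//     Mat result;
//     async_out.get(result);   // blocks until the request finishes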
2963 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2965 CV_TRACE_FUNCTION();
2967 String layerName = outputName;
2969 if (layerName.empty())
2970 layerName = getLayerNames().back();
2972 std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2973 impl->setUpNet(pins);
2974 impl->forwardToLayer(impl->getLayerData(layerName));
2976 LayerPin pin = impl->getPinByAlias(layerName);
2977 LayerData &ld = impl->layers[pin.lid];
2979 if (outputBlobs.isUMat())
2981 impl->getBlob(layerName).copyTo(outputBlobs);
2983 else if (outputBlobs.isMat())
2985 outputBlobs.assign(impl->getBlob(layerName));
2987 else if (outputBlobs.isMatVector())
2989 if (impl->preferableTarget != DNN_TARGET_CPU)
2991 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2993 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2994 ld.outputBlobsWrappers[i]->copyToHost();
2997 if (ld.outputBlobs[0].depth() == CV_32F)
2999 std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3000 outputvec = ld.outputBlobs;
3002 std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3003 outputvec.resize(ld.outputBlobs.size());
3004 for (int i = 0; i < outputvec.size(); i++)
3005 convertFp16(ld.outputBlobs[i], outputvec[i]);
3008 else if (outputBlobs.isUMatVector())
3010 std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
3012 if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
3013 IS_DNN_OPENCL_TARGET(impl->preferableTarget))
3015 if (impl->preferableTarget == DNN_TARGET_OPENCL)
3016 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
3017 else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
3019 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
3020 outputvec.resize(out_vec.size());
3021 for (int i = 0; i < out_vec.size(); i++)
3022 convertFp16(out_vec[i], outputvec[i]);
3027 outputvec.resize(ld.outputBlobs.size());
3028 for (int i = 0; i < outputvec.size(); ++i)
3029 ld.outputBlobs[i].copyTo(outputvec[i]);
3034 void Net::forward(OutputArrayOfArrays outputBlobs,
3035 const std::vector<String>& outBlobNames)
3037 CV_TRACE_FUNCTION();
3039 std::vector<LayerPin> pins;
3040 for (int i = 0; i < outBlobNames.size(); i++)
3042 pins.push_back(impl->getPinByAlias(outBlobNames[i]));
3045 impl->setUpNet(pins);
3047 LayerPin out = impl->getLatestLayerPin(pins);
3049 impl->forwardToLayer(impl->getLayerData(out.lid));
3051 std::vector<Mat> matvec;
3052 for (int i = 0; i < pins.size(); i++)
3054 matvec.push_back(impl->getBlob(pins[i]));
3057 std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
3061 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
3062 const std::vector<String>& outBlobNames)
3064 CV_TRACE_FUNCTION();
3066 std::vector<LayerPin> pins;
3067 for (int i = 0; i < outBlobNames.size(); i++)
3069 pins.push_back(impl->getPinByAlias(outBlobNames[i]));
3072 impl->setUpNet(pins);
3074 LayerPin out = impl->getLatestLayerPin(pins);
3076 impl->forwardToLayer(impl->getLayerData(out.lid));
3078 outputBlobs.resize(outBlobNames.size());
3079 for (int i = 0; i < outBlobNames.size(); i++)
3081 std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
3082 outputBlobs[i].resize(lp.size());
3083 for (int j = 0; j < lp.size(); j++)
3085 outputBlobs[i][j] = impl->getBlob(lp[j]);
3090 void Net::setPreferableBackend(int backendId)
3092 CV_TRACE_FUNCTION();
3093 CV_TRACE_ARG(backendId);
3095 if( impl->preferableBackend != backendId )
3097 impl->preferableBackend = backendId;
3098 impl->netWasAllocated = false;
3103 void Net::setPreferableTarget(int targetId)
3105 CV_TRACE_FUNCTION();
3106 CV_TRACE_ARG(targetId);
3108 if( impl->preferableTarget != targetId )
3110 impl->preferableTarget = targetId;
3111 if (IS_DNN_OPENCL_TARGET(targetId))
3114 #ifdef HAVE_INF_ENGINE
3115 if (impl->preferableBackend == DNN_BACKEND_OPENCV)
3117 if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
3118 impl->preferableBackend == DNN_BACKEND_OPENCV)
3119 #endif // HAVE_INF_ENGINE
3120 impl->preferableTarget = DNN_TARGET_CPU;
3122 bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
3123 if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
3124 impl->preferableTarget = DNN_TARGET_OPENCL;
3127 impl->netWasAllocated = false;
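// Selection sketch (illustrative): requesting an FP16 OpenCL target with the
// OpenCV backend; as implemented above, the target silently degrades to
// DNN_TARGET_OPENCL when the device does not advertise cl_khr_fp16:
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL_FP16);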
3132 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
3134 CV_TRACE_FUNCTION();
3136 impl->netInputLayer->setNames(inputBlobNames);
3139 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
3141 CV_TRACE_FUNCTION();
3142 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3146 pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);
3149 CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
3151 LayerData &ld = impl->layers[pin.lid];
3152 const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
3153 ld.outputBlobs.resize(numInputs);
3154 ld.outputBlobsWrappers.resize(numInputs);
3155 impl->netInputLayer->inputsData.resize(numInputs);
3156 impl->netInputLayer->scaleFactors.resize(numInputs);
3157 impl->netInputLayer->means.resize(numInputs);
3159 MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
3160 Mat blob_ = blob.getMat();
3161 bool oldShape = prevShape == shape(blob_);
3164 blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
3168 ld.outputBlobs[pin.oid] = blob_.clone();
3169 impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
3172 if (!ld.outputBlobsWrappers[pin.oid].empty())
3174 ld.outputBlobsWrappers[pin.oid]->setHostDirty();
3176 impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
3177 impl->netInputLayer->means[pin.oid] = mean;
3178 impl->netWasAllocated = impl->netWasAllocated && oldShape;
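// Usage sketch (input name and preprocessing constants are illustrative):
// scalefactor and mean are stored per input and applied by the input layer,
// so simple preprocessing can be folded into setInput():
//
//     Mat blob = blobFromImage(frame, 1.0, Size(300, 300));
//     net.setInput(blob, "data", 1.0 / 127.5, Scalar(127.5, 127.5, 127.5));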
3181 Mat Net::getParam(LayerId layer, int numParam)
3183 LayerData &ld = impl->getLayerData(layer);
3184 std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3185 CV_Assert(numParam < (int)layerBlobs.size());
3186 return layerBlobs[numParam];
3189 void Net::setParam(LayerId layer, int numParam, const Mat &blob)
3191 LayerData &ld = impl->getLayerData(layer);
3193 std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3194 CV_Assert(numParam < (int)layerBlobs.size());
3195 // we don't make strong checks; use this function carefully
3196 layerBlobs[numParam] = blob;
3199 int Net::getLayerId(const String &layer)
3201 return impl->getLayerId(layer);
3204 String parseLayerParams(const String& name, const LayerParams& lp) {
3205 DictValue param = lp.get(name);
3206 std::ostringstream out;
3208 switch (param.size()) {
3209 case 1: out << ": "; break;
3210 case 2: out << "(HxW): "; break;
3211 case 3: out << "(DxHxW): "; break;
3212 default: CV_Error(Error::StsNotImplemented, format("Unsupported %s size = %d", name.c_str(), param.size()));
3214 for (size_t i = 0; i < param.size() - 1; i++) {
3215 out << param.get<int>(i) << " x ";
3217 out << param.get<int>(param.size() - 1) << "\\l";
3218 return out.str();
3221 String Net::dump()
3223 CV_Assert(!empty());
3225 if (impl->netInputLayer->inputsData.empty())
3226 CV_Error(Error::StsError, "Network input is not set; call setInput() before dump()");
3228 if (!impl->netWasAllocated)
3231 std::ostringstream out;
3232 std::map<int, LayerData>& map = impl->layers;
3233 int prefBackend = impl->preferableBackend;
3234 std::vector<std::vector<int> > skippedLayers;
3235 std::vector<int> skipId;
3236 std::vector<int> allLayers(map.size(), -1);
3238 Ptr<BackendNode> prevNode;
3239 for (std::map<int, LayerData>::reverse_iterator rit = map.rbegin(); rit != map.rend(); ++rit)
3241 std::map<int, Ptr<BackendNode> >::iterator itBackend = rit->second.backendNodes.find(prefBackend);
3242 if (prefBackend == DNN_BACKEND_OPENCV || itBackend == rit->second.backendNodes.end() ||
3243 itBackend->second.empty())
3245 if (rit->second.skip)
3246 skipId.push_back(rit->first);
3247 else if (!skipId.empty())
3249 if (prefBackend == DNN_BACKEND_OPENCV || prevNode.empty())
3250 skipId.push_back(rit->first);
3251 else if (idPrev != -1)
3252 skipId.push_back(idPrev);
3254 std::sort(skipId.begin(), skipId.end());
3255 for (int i = 0; i < skipId.size(); i++) {
3256 allLayers[skipId[i]] = skippedLayers.size();
3258 skippedLayers.push_back(skipId);
3264 if (itBackend->second == prevNode)
3265 skipId.push_back(idPrev);
3266 else if (!skipId.empty())
3268 skipId.push_back(idPrev);
3269 std::sort(skipId.begin(), skipId.end());
3270 for (int i = 0; i < skipId.size(); i++) {
3271 allLayers[skipId[i]] = skippedLayers.size();
3273 skippedLayers.push_back(skipId);
3276 idPrev = rit->first;
3277 prevNode = itBackend->second;
3280 String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462", "#ff4848"};
3282 switch (prefBackend) {
3283 case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
3284 case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
3285 case DNN_BACKEND_INFERENCE_ENGINE: backend = "DLIE/"; break;
3286 case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
3287 case DNN_BACKEND_CUDA: backend = "CUDA/"; break;
3289 out << "digraph G {" << '\n';
3291 for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3293 String name = it->second.params.name;
3294 if (allLayers[it->first] == -1 && !name.empty()) {
3295 out << " " << "\"" << name << "\"" << " [label=\"";
3297 skipId.push_back(it->first);
3299 else if (name.empty() || it->first != skippedLayers[allLayers[it->first]][0])
3300 continue;
3301 else { // first node in cluster : it->first == skippedLayers[allLayers[it->first]][0]
3302 int cluster = allLayers[it->first];
3303 out << " " << "\"" << "cluster_" << cluster << "\"" << " [label=\"{";
3304 skipId = skippedLayers[allLayers[it->first]]; // vertices in current cluster
3306 for (int i = 0; i < skipId.size(); i++)
3308 LayerParams& lp = map[skipId[i]].params;
3309 if (!lp.name.empty()) {
3313 out << lp.name << "\\n" << lp.type << "\\n";
3314 if (lp.has("kernel_size")) {
3315 String kernel = parseLayerParams("kernel_size", lp);
3317 } else if (lp.has("kernel_h") && lp.has("kernel_w")) {
3318 DictValue h = lp.get("kernel_h");
3319 DictValue w = lp.get("kernel_w");
3320 out << "kernel (HxW): " << h << " x " << w << "\\l";
3322 if (lp.has("stride")) {
3323 String stride = parseLayerParams("stride", lp);
3325 } else if (lp.has("stride_h") && lp.has("stride_w")) {
3326 DictValue h = lp.get("stride_h");
3327 DictValue w = lp.get("stride_w");
3328 out << "stride (HxW): " << h << " x " << w << "\\l";
3330 if (lp.has("dilation")) {
3331 String dilation = parseLayerParams("dilation", lp);
3333 } else if (lp.has("dilation_h") && lp.has("dilation_w")) {
3334 DictValue h = lp.get("dilation_h");
3335 DictValue w = lp.get("dilation_w");
3336 out << "dilation (HxW): " << h << " x " << w << "\\l";
3338 if (lp.has("pad")) {
3339 DictValue pad = lp.get("pad");
3341 switch (pad.size()) {
3342 case 1: out << ": " << pad << "\\l"; break;
3343 case 2: out << "(HxW): (" << pad.get<int>(0) << " x " << pad.get<int>(1) << ")" << "\\l"; break;
3344 case 4: out << "(HxW): (" << pad.get<int>(0) << ", " << pad.get<int>(2) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(3) << ")" << "\\l"; break;
3345 case 6: out << "(DxHxW): (" << pad.get<int>(0) << ", " << pad.get<int>(3) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(4)
3346 << ") x (" << pad.get<int>(2) << ", " << pad.get<int>(5) << ")" << "\\l"; break;
3347 default: CV_Error(Error::StsNotImplemented, format("Unsupported pad size = %d", pad.size()));
3349 } else if (lp.has("pad_l") && lp.has("pad_t") && lp.has("pad_r") && lp.has("pad_b")) {
3350 DictValue l = lp.get("pad_l");
3351 DictValue t = lp.get("pad_t");
3352 DictValue r = lp.get("pad_r");
3353 DictValue b = lp.get("pad_b");
3354 out << "pad (HxW): (" << t << ", " << b << ") x (" << l << ", " << r << ")" << "\\l";
3356 else if (lp.has("pooled_w") || lp.has("pooled_h")) {
3357 DictValue h = lp.get("pooled_h");
3358 DictValue w = lp.get("pooled_w");
3359 out << "pad (HxW): " << h << " x " << w << "\\l";
3361 if (lp.has("pool")) {
3362 out << "pool: " << lp.get("pool") << "\\l";
3364 if (lp.has("global_pooling")) {
3365 out << "global_pooling: " << lp.get("global_pooling") << "\\l";
3367 if (lp.has("group")) {
3368 out << "group: " << lp.get("group") << "\\l";
3372 if (!it->second.outputBlobs.empty())
3373 out << "output: " << it->second.outputBlobs[0].size << "\\l";
3375 Ptr<BackendNode> layerBackend = it->second.backendNodes[prefBackend];
3376 out << (!layerBackend.empty() ? backend : "OCV/");
3378 switch (it->second.layerInstance->preferableTarget) {
3379 case DNN_TARGET_CPU: out << "CPU\\n"; colorId = layerBackend.empty() ? 0 : 5; break;
3380 case DNN_TARGET_OPENCL: out << "OCL\\n"; colorId = 1; break;
3381 case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16\\n"; colorId = 2; break;
3382 case DNN_TARGET_MYRIAD: out << "MYRIAD\\n"; colorId = 3; break;
3383 case DNN_TARGET_FPGA: out << "FPGA\\n"; colorId = 4; break;
3384 case DNN_TARGET_CUDA: out << "CUDA\\n"; colorId = 5; break;
3385 case DNN_TARGET_CUDA_FP16: out << "CUDA_FP16\\n"; colorId = 6; break;
3387 out << ((skipId.size() == 1)? "\" " : " }\" ");
3388 out << "fillcolor=\"" << colors[colorId] << "\" ";
3389 out << "style=filled ";
3390 out << "shape=" << ((skipId.size() == 1)? "box" : "record") << "]" << '\n';
3394 int inputsSize = impl->netInputLayer->outNames.size();
3395 for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3397 if (allLayers[it->first] == -1) // node
3399 for (int i = 0; i < it->second.consumers.size(); i++)
3401 int outId = it->second.consumers[i].lid;
3402 if (it == map.begin() && inputsSize > 1)
3403 out << " " << "\"" << it->second.name << "_" << i << "\"" << " -> ";
3405 out << " " << "\"" << it->second.name << "\"" << " -> ";
3406 if (allLayers[outId] == -1) // node
3407 out << "\"" << map[outId].name << "\"" << '\n';
3409 out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3412 else if (it->first == skippedLayers[allLayers[it->first]].back()) // edges from last layer in cluster
3414 for (int i = 0; i < it->second.consumers.size(); i++)
3416 int outId = it->second.consumers[i].lid;
3417 if (allLayers[outId] == -1) { // node
3418 out << " " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3419 out << "\"" << map[outId].name << "\"" << '\n';
3421 else if (allLayers[outId] != allLayers[it->first]) { // another cluster
3422 out << " " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3423 out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3432 void Net::dumpToFile(const String& path) {
3433 std::ofstream file(path.c_str());
3434 file << dump();
3435 file.close();
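// Note: dump() emits Graphviz DOT text ("digraph G { ... }"), so a file written
// by dumpToFile() can be rendered with an external tool, e.g. (illustrative):
//     dot -Tpng net.dot -o net.png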
3438 Ptr<Layer> Net::getLayer(LayerId layerId)
3440 LayerData &ld = impl->getLayerData(layerId);
3441 return ld.getLayerInstance();
3444 std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
3446 LayerData &ld = impl->getLayerData(layerId);
3447 if (!ld.layerInstance)
3448 CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));
3450 std::vector<Ptr<Layer> > inputLayers;
3451 inputLayers.reserve(ld.inputLayersId.size());
3452 std::set<int>::iterator it;
3453 for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
3454 inputLayers.push_back(getLayer(*it));
3459 std::vector<String> Net::getLayerNames() const
3461 std::vector<String> res;
3462 res.reserve(impl->layers.size());
3464 Impl::MapIdToLayerData::iterator it;
3465 for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3467 if (it->second.id) //skip Data layer
3468 res.push_back(it->second.name);
3474 bool Net::empty() const
3476 return impl->layers.size() <= 1; //first layer is default Data layer
3479 std::vector<int> Net::getUnconnectedOutLayers() const
3481 std::vector<int> layersIds;
3483 Impl::MapIdToLayerData::iterator it;
3484 for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3486 int lid = it->first;
3487 LayerData &ld = it->second;
3489 if (ld.requiredOutputs.size() == 0)
3490 layersIds.push_back(lid);
3496 std::vector<String> Net::getUnconnectedOutLayersNames() const
3498 std::vector<int> ids = getUnconnectedOutLayers();
3499 const size_t n = ids.size();
3500 std::vector<String> names(n);
3501 for (size_t i = 0; i < n; ++i)
3503 names[i] = impl->layers[ids[i]].name;
3508 void Net::getLayersShapes(const ShapesVec& netInputShapes,
3509 std::vector<int>& layersIds,
3510 std::vector<ShapesVec>& inLayersShapes,
3511 std::vector<ShapesVec>& outLayersShapes) const
3514 inLayersShapes.clear();
3515 outLayersShapes.clear();
3517 Impl::LayersShapesMap inOutShapes;
3518 impl->getLayersShapes(netInputShapes, inOutShapes);
3520 for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
3521 it != inOutShapes.end(); it++)
3523 layersIds.push_back(it->first);
3524 inLayersShapes.push_back(it->second.in);
3525 outLayersShapes.push_back(it->second.out);
3529 void Net::getLayersShapes(const MatShape& netInputShape,
3530 std::vector<int>& layerIds,
3531 std::vector<ShapesVec>& inLayersShapes,
3532 std::vector<ShapesVec>& outLayersShapes) const
3534 getLayersShapes(ShapesVec(1, netInputShape),
3535 layerIds, inLayersShapes, outLayersShapes);
3538 void Net::getLayerShapes(const MatShape& netInputShape,
3540 ShapesVec& inLayerShapes,
3541 ShapesVec& outLayerShapes) const
3543 getLayerShapes(ShapesVec(1, netInputShape),
3544 layerId, inLayerShapes, outLayerShapes);
3548 void Net::getLayerShapes(const ShapesVec& netInputShapes,
3550 ShapesVec& inLayerShapes,
3551 ShapesVec& outLayerShapes) const
3554 impl->getLayerShapes(netInputShapes, layerId, shapes);
3555 inLayerShapes = shapes.in;
3556 outLayerShapes = shapes.out;
3559 int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
3561 CV_TRACE_FUNCTION();
3564 std::vector<int> ids;
3565 std::vector<std::vector<MatShape> > inShapes, outShapes;
3566 getLayersShapes(netInputShapes, ids, inShapes, outShapes);
3567 CV_Assert(inShapes.size() == outShapes.size());
3568 CV_Assert(inShapes.size() == ids.size());
3570 for(int i = 0; i < ids.size(); i++)
3572 flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
3579 int64 Net::getFLOPS(const MatShape& netInputShape) const
3581 return getFLOPS(std::vector<MatShape>(1, netInputShape));
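// Usage sketch (shape values are illustrative): estimate the network complexity
// for a single 224x224 three-channel NCHW input:
//
//     int64 flops = net.getFLOPS(shape(1, 3, 224, 224));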
3584 int64 Net::getFLOPS(const int layerId,
3585 const std::vector<MatShape>& netInputShapes) const
3587 Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3588 CV_Assert(layer != impl->layers.end());
3591 impl->getLayerShapes(netInputShapes, layerId, shapes);
3593 return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
3596 int64 Net::getFLOPS(const int layerId,
3597 const MatShape& netInputShape) const
3599 return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
3602 void Net::getLayerTypes(std::vector<String>& layersTypes) const
3604 layersTypes.clear();
3606 std::map<String, int> layers;
3607 for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3608 it != impl->layers.end(); it++)
3610 if (layers.find(it->second.type) == layers.end())
3611 layers[it->second.type] = 0;
3612 layers[it->second.type]++;
3615 for (std::map<String, int>::iterator it = layers.begin();
3616 it != layers.end(); it++)
3618 layersTypes.push_back(it->first);
3622 int Net::getLayersCount(const String& layerType) const
3625 for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3626 it != impl->layers.end(); it++)
3628 if (it->second.type == layerType)
3634 void Net::getMemoryConsumption(const int layerId,
3635 const std::vector<MatShape>& netInputShapes,
3636 size_t& weights, size_t& blobs) const
3638 CV_TRACE_FUNCTION();
3640 Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3641 CV_Assert(layer != impl->layers.end());
3643 weights = blobs = 0;
3645 for(int i = 0; i < layer->second.params.blobs.size(); i++)
3647 const Mat& weightsBlob = layer->second.params.blobs[i];
3648 weights += weightsBlob.total()*weightsBlob.elemSize();
3651 ShapesVec inLayerShapes, outLayerShapes;
3652 getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
3653 for(int i = 0; i < outLayerShapes.size(); i++)
3655 blobs += total(outLayerShapes[i]) * sizeof(float);
3659 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3660 size_t& weights, size_t& blobs) const
3662 CV_TRACE_FUNCTION();
3664 std::vector<int> layerIds;
3665 std::vector<size_t> w, b;
3666 getMemoryConsumption(netInputShapes, layerIds, w, b);
3668 weights = blobs = 0;
3669 for(int i = 0; i < layerIds.size(); i++)
3676 void Net::getMemoryConsumption(const int layerId,
3677 const MatShape& netInputShape,
3678 size_t& weights, size_t& blobs) const
3680 getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
3684 void Net::getMemoryConsumption(const MatShape& netInputShape,
3685 size_t& weights, size_t& blobs) const
3687 getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
3691 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3692 std::vector<int>& layerIds, std::vector<size_t>& weights,
3693 std::vector<size_t>& blobs) const
3695 CV_TRACE_FUNCTION();
3701 std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
3703 getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
3705 for(int i = 0; i < layerIds.size(); i++)
3708 Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
3709 CV_Assert(layer != impl->layers.end());
3711 for(int j = 0; j < layer->second.params.blobs.size(); j++)
3713 const Mat& weightsBlob = layer->second.params.blobs[j];
3714 w += weightsBlob.total()*weightsBlob.elemSize();
3717 for(int j = 0; j < outLayerShapes[i].size(); j++)
3719 b += total(outLayerShapes[i][j]) * sizeof(float);
3722 weights.push_back(w);
3727 void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
3728 std::vector<size_t>& weights, std::vector<size_t>& blobs) const
3730 getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
3734 void Net::enableFusion(bool fusion)
3736 if( impl->fusion != fusion )
3738 impl->fusion = fusion;
3739 impl->netWasAllocated = false;
3744 void Net::setHalideScheduler(const String& scheduler)
3746 CV_TRACE_FUNCTION();
3747 CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());
3749 impl->halideConfigFile = scheduler;
3752 int64 Net::getPerfProfile(std::vector<double>& timings)
3754 timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
3755 int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
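// Usage sketch: timings are returned in ticks (one entry per layer, the input
// layer excluded), so convert with getTickFrequency():
//
//     std::vector<double> layerTimes;
//     int64 totalTicks = net.getPerfProfile(layerTimes);
//     double totalMs = totalTicks * 1000.0 / getTickFrequency();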
3759 //////////////////////////////////////////////////////////////////////////
3761 Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
3763 Layer::Layer(const LayerParams ¶ms)
3764 : blobs(params.blobs), name(params.name), type(params.type)
3766 preferableTarget = DNN_TARGET_CPU;
3769 void Layer::setParamsFrom(const LayerParams ¶ms)
3771 blobs = params.blobs;
3776 int Layer::inputNameToIndex(String)
3781 int Layer::outputNameToIndex(const String&)
3786 bool Layer::supportBackend(int backendId)
3788 return backendId == DNN_BACKEND_OPENCV;
3791 Ptr<BackendNode> Layer::initCUDA(
3793 const std::vector<Ptr<BackendWrapper>>&,
3794 const std::vector<Ptr<BackendWrapper>>&)
3796 CV_Error(Error::StsNotImplemented, "CUDA pipeline of " + type +
3797 " layers is not defined.");
3798 return Ptr<BackendNode>();
3801 Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
3803 CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
3804 " layers is not defined.");
3805 return Ptr<BackendNode>();
3808 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
3810 CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
3811 " layers is not defined.");
3812 return Ptr<BackendNode>();
3815 Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
3817 CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
3818 " layers is not defined.");
3819 return Ptr<BackendNode>();
3822 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
3823 const std::vector<Mat> &outputs, int targetId) const
3826 CV_TRACE_FUNCTION();
3828 Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
3829 xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
3830 Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
3832 int outW, outH, outC, outN;
3833 getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
3835 if (targetId == DNN_TARGET_CPU)
3837 if (outW == 1 && outH == 1)
3839 if (outC + outN == 1)
3843 top.split(c, co, ci, 8)
3844 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3848 top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
3855 top.reorder(x, c, y)
3856 .split(y, yo, yi, 2)
3860 .vectorize(x, outW >= 16 ? 16 : outW);
3864 else if (targetId == DNN_TARGET_OPENCL)
3866 if (outW == 1 && outH == 1)
3868 int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
3869 top.split(c, co, ci, c_split)
3870 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3876 int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
3877 int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
3878 // Supported vectorization widths: 2, 3, 4, 8, 16
3879 int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
3880 top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
3881 .split(c, co, ci, c_split)
3882 .gpu_blocks(xo, yo, co)
3883 .gpu_threads(xi, yi)
3884 .reorder(xi, yi, ci, xo, yo, co)
3889 CV_Error(Error::StsNotImplemented, "Unknown target identifier");
3890 #endif // HAVE_HALIDE
3893 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
3895 return Ptr<BackendNode>();
3898 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
3899 bool Layer::tryFuse(Ptr<Layer>&) { return false; }
3900 void Layer::getScaleShift(Mat& scale, Mat& shift) const
3906 void Layer::unsetAttached()
3908 setActivation(Ptr<ActivationLayer>());
3911 template <typename T>
3912 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
3914 pv.resize(v.size());
3915 for (size_t i = 0; i < v.size(); i++)
3916 pv[i] = const_cast<T*>(&v[i]);
3919 void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
3921 CV_TRACE_FUNCTION();
3922 this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
3925 void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
3927 CV_UNUSED(input);CV_UNUSED(output);
3930 void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
3932 CV_TRACE_FUNCTION();
3933 std::vector<Mat> inputs, outputs;
3934 inputs_arr.getMatVector(inputs);
3935 outputs_arr.getMatVector(outputs);
3937 std::vector<Mat*> inputsp;
3938 vecToPVec(inputs, inputsp);
3939 this->finalize(inputsp, outputs);
3942 std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
3944 CV_TRACE_FUNCTION();
3946 std::vector<Mat> outputs;
3947 this->finalize(inputs, outputs);
3951 void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
3953 // We kept this method for compatibility. DNN calls it now only to support users' implementations.
3956 void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3958 CV_TRACE_FUNCTION();
3959 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3961 Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
3964 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3966 CV_TRACE_FUNCTION();
3967 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
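// When the target is DNN_TARGET_OPENCL_FP16 the blobs arrive as CV_16S-packed
// half floats; the branch below up-converts them to CV_32F, runs the generic
// forward(), and converts the outputs back so callers still see FP16 data.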
3969 if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
3971 std::vector<UMat> inputs;
3972 std::vector<UMat> outputs;
3973 std::vector<UMat> internals;
3975 std::vector<UMat> orig_inputs;
3976 std::vector<UMat> orig_outputs;
3977 std::vector<UMat> orig_internals;
3979 inputs_arr.getUMatVector(orig_inputs);
3980 outputs_arr.getUMatVector(orig_outputs);
3981 internals_arr.getUMatVector(orig_internals);
3983 inputs.resize(orig_inputs.size());
3984 for (size_t i = 0; i < orig_inputs.size(); i++)
3985 convertFp16(orig_inputs[i], inputs[i]);
3987 outputs.resize(orig_outputs.size());
3988 for (size_t i = 0; i < orig_outputs.size(); i++)
3989 outputs[i].create(shape(orig_outputs[i]), CV_32F);
3991 internals.resize(orig_internals.size());
3992 for (size_t i = 0; i < orig_internals.size(); i++)
3993 internals[i].create(shape(orig_internals[i]), CV_32F);
3995 forward(inputs, outputs, internals);
3997 for (size_t i = 0; i < outputs.size(); i++)
3998 convertFp16(outputs[i], orig_outputs[i]);
4000 // sync results back
4001 outputs_arr.assign(orig_outputs);
4002 internals_arr.assign(orig_internals);
4005 std::vector<Mat> inpvec;
4006 std::vector<Mat> outputs;
4007 std::vector<Mat> internals;
4009 inputs_arr.getMatVector(inpvec);
4010 outputs_arr.getMatVector(outputs);
4011 internals_arr.getMatVector(internals);
4013 std::vector<Mat*> inputs(inpvec.size());
4014 for (int i = 0; i < inpvec.size(); i++)
4015 inputs[i] = &inpvec[i];
4017 this->forward(inputs, outputs, internals);
4019 // sync results back
4020 outputs_arr.assign(outputs);
4021 internals_arr.assign(internals);
4024 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
4026 CV_TRACE_FUNCTION();
4028 this->finalize(inputs, outputs);
4029 this->forward(inputs, outputs, internals);
4034 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
4035 const int requiredOutputs,
4036 std::vector<MatShape> &outputs,
4037 std::vector<MatShape> &internals) const
4039 CV_Assert(inputs.size());
4040 outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
4044 //////////////////////////////////////////////////////////////////////////
4046 static Mutex& getLayerFactoryMutex()
4048 static Mutex* volatile instance = NULL;
4049 if (instance == NULL)
4051 cv::AutoLock lock(getInitializationMutex());
4052 if (instance == NULL)
4053 instance = new Mutex();
4058 typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
4060 static LayerFactory_Impl& getLayerFactoryImpl_()
4062 static LayerFactory_Impl impl;
4066 static LayerFactory_Impl& getLayerFactoryImpl()
4068 static LayerFactory_Impl* volatile instance = NULL;
4069 if (instance == NULL)
4071 cv::AutoLock lock(getLayerFactoryMutex());
4072 if (instance == NULL)
4074 instance = &getLayerFactoryImpl_();
4075 initializeLayerFactory();
4081 void LayerFactory::registerLayer(const String &type, Constructor constructor)
4083 CV_TRACE_FUNCTION();
4084 CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4086 cv::AutoLock lock(getLayerFactoryMutex());
4087 String type_ = toLowerCase(type);
4088 LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
4090 if (it != getLayerFactoryImpl().end())
4092 if (it->second.back() == constructor)
4093 CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
4094 it->second.push_back(constructor);
4096 getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));
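// Registration sketch (MyLayer is a hypothetical cv::dnn::Layer subclass):
//
//     static Ptr<Layer> createMyLayer(LayerParams& params)
//     { return Ptr<Layer>(new MyLayer(params)); }
//     ...
//     LayerFactory::registerLayer("MyType", createMyLayer);
//
// Type lookups are case-insensitive (the key is lower-cased above), and
// registering the same type again pushes a constructor that shadows the
// previous one until unregisterLayer() removes it.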
4099 void LayerFactory::unregisterLayer(const String &type)
4101 CV_TRACE_FUNCTION();
4102 CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4104 cv::AutoLock lock(getLayerFactoryMutex());
4105 String type_ = toLowerCase(type);
4107 LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
4108 if (it != getLayerFactoryImpl().end())
4110 if (it->second.size() > 1)
4111 it->second.pop_back();
4113 getLayerFactoryImpl().erase(it);
4117 Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
4119 CV_TRACE_FUNCTION();
4120 CV_TRACE_ARG_VALUE(type, "type", type.c_str());
4122 cv::AutoLock lock(getLayerFactoryMutex());
4123 String type_ = toLowerCase(type);
4124 LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);
4126 if (it != getLayerFactoryImpl().end())
4128 CV_Assert(!it->second.empty());
4129 return it->second.back()(params);
4133 return Ptr<Layer>(); //NULL
4137 BackendNode::BackendNode(int backendId) : backendId(backendId) {}
4139 BackendNode::~BackendNode() {};
4141 BackendWrapper::BackendWrapper(int backendId, int targetId)
4142 : backendId(backendId), targetId(targetId) {}
4144 BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
4146 CV_Error(Error::StsNotImplemented,
4147 "Constructor of backend wrapper must be implemented");
4150 BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
4152 CV_Error(Error::StsNotImplemented,
4153 "Constructor of backend wrapper must be implemented");
4156 BackendWrapper::~BackendWrapper() {}
4158 Net readNet(const String& _model, const String& _config, const String& _framework)
4160 String framework = toLowerCase(_framework);
4161 String model = _model;
4162 String config = _config;
4163 const std::string modelExt = model.substr(model.rfind('.') + 1);
4164 const std::string configExt = config.substr(config.rfind('.') + 1);
4165 if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
4166 modelExt == "prototxt" || configExt == "prototxt")
4168 if (modelExt == "prototxt" || configExt == "caffemodel")
4169 std::swap(model, config);
4170 return readNetFromCaffe(config, model);
4172 if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
4173 modelExt == "pbtxt" || configExt == "pbtxt")
4175 if (modelExt == "pbtxt" || configExt == "pb")
4176 std::swap(model, config);
4177 return readNetFromTensorflow(model, config);
4179 if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
4180 configExt == "t7" || configExt == "net")
4182 return readNetFromTorch(model.empty() ? config : model);
4184 if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
4185 modelExt == "cfg" || configExt == "cfg")
4187 if (modelExt == "cfg" || configExt == "weights")
4188 std::swap(model, config);
4189 return readNetFromDarknet(config, model);
4191 if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
4192 modelExt == "xml" || configExt == "xml")
4194 if (modelExt == "xml" || configExt == "bin")
4195 std::swap(model, config);
4196 return readNetFromModelOptimizer(config, model);
4198 if (framework == "onnx" || modelExt == "onnx")
4200 return readNetFromONNX(model);
4202 CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
4203 model + (config.empty() ? "" : ", " + config));
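// Dispatch sketch (file names are illustrative): the framework is inferred from
// the explicit framework string or from the file extensions, and each model/config
// pair may be passed in either order thanks to the swaps above:
//
//     readNet("bvlc_googlenet.caffemodel", "deploy.prototxt");  // Caffe
//     readNet("frozen_inference_graph.pb", "graph.pbtxt");      // TensorFlow
//     readNet("yolov3.weights", "yolov3.cfg");                  // Darknet
//     readNet("model.xml", "model.bin");                        // Model Optimizer (IR)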
4206 Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
4207 const std::vector<uchar>& bufferConfig)
4209 String framework = toLowerCase(_framework);
4210 if (framework == "caffe")
4211 return readNetFromCaffe(bufferConfig, bufferModel);
4212 else if (framework == "tensorflow")
4213 return readNetFromTensorflow(bufferModel, bufferConfig);
4214 else if (framework == "darknet")
4215 return readNetFromDarknet(bufferConfig, bufferModel);
4216 else if (framework == "torch")
4217 CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
4218 else if (framework == "dldt")
4219 CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
4220 CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
4223 Net readNetFromModelOptimizer(const String &xml, const String &bin)
4225 return Net::readFromModelOptimizer(xml, bin);
4228 CV__DNN_INLINE_NS_END