Fix OpenVINO 2019R1 compilation
[platform/upstream/opencv.git] / modules/dnn/src/dnn.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "op_halide.hpp"
#include "op_inf_engine.hpp"
#include "op_vkcom.hpp"
#include "halide_scheduler.hpp"
#include <set>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <fstream>
#include <iterator>
#include <numeric>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>

#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>

namespace cv {
namespace dnn {
CV__DNN_INLINE_NS_BEGIN

// This option is useful for running valgrind memory error detection.
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);

#ifdef HAVE_OPENCL
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
#endif

static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef HAVE_INF_ENGINE
    (size_t)DNN_BACKEND_INFERENCE_ENGINE
#else
    (size_t)DNN_BACKEND_OPENCV
#endif
);

// Additional checks (slow down execution!)
static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
using std::make_pair;
using std::set;

//==================================================================================================

class BackendRegistry
{
public:
    typedef std::vector< std::pair<Backend, Target> > BackendsList;
    const BackendsList & getBackends() const { return backends; }
    static BackendRegistry & getRegistry()
    {
        static BackendRegistry impl;
        return impl;
    }
private:
    BackendRegistry()
    {
#ifdef HAVE_HALIDE
        backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
#  ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL())
            backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
#  endif
#endif // HAVE_HALIDE

#ifdef HAVE_INF_ENGINE
        if (checkIETarget(DNN_TARGET_CPU))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU));
        if (checkIETarget(DNN_TARGET_MYRIAD))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
        if (checkIETarget(DNN_TARGET_FPGA))
            backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_FPGA));
#  ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL() && ocl::Device::getDefault().isIntel())
        {
            if (checkIETarget(DNN_TARGET_OPENCL))
                backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL));
            if (checkIETarget(DNN_TARGET_OPENCL_FP16))
                backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
        }
#  endif
#endif // HAVE_INF_ENGINE

#ifdef HAVE_OPENCL
        if (cv::ocl::useOpenCL())
        {
            backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
            backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
        }
#endif

        backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));

#ifdef HAVE_VULKAN
        if (haveVulkan())
            backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
#endif
    }
    static inline bool checkIETarget(int target)
    {
#ifndef HAVE_INF_ENGINE
        return false;
#else
        cv::dnn::Net net;
        cv::dnn::LayerParams lp;
        lp.set("kernel_size", 1);
        lp.set("num_output", 1);
        lp.set("bias_term", false);
        lp.type = "Convolution";
        lp.name = "testLayer";
        lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
        net.addLayerToPrev(lp.name, lp.type, lp);
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
        net.setPreferableTarget(target);
        static int inpDims[] = {1, 2, 3, 4};
        net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
        try
        {
            net.forward();
        }
        catch(...)
        {
            return false;
        }
        return true;
#endif
    }

    BackendsList backends;
};


std::vector< std::pair<Backend, Target> > getAvailableBackends()
{
    return BackendRegistry::getRegistry().getBackends();
}

std::vector<Target> getAvailableTargets(Backend be)
{
    if (be == DNN_BACKEND_DEFAULT)
        be = (Backend)PARAM_DNN_BACKEND_DEFAULT;

    std::vector<Target> result;
    const BackendRegistry::BackendsList all_backends = getAvailableBackends();
    for(BackendRegistry::BackendsList::const_iterator i = all_backends.begin(); i != all_backends.end(); ++i )
    {
        if (i->first == be)
            result.push_back(i->second);
    }
    return result;
}
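
/* Usage sketch (illustrative comment, not part of the original file):
 * enumerating the backend/target pairs registered above from application
 * code, assuming <iostream> and the dnn module headers are available.
 *
 *     std::vector<std::pair<cv::dnn::Backend, cv::dnn::Target> > all =
 *         cv::dnn::getAvailableBackends();
 *     for (size_t i = 0; i < all.size(); ++i)
 *         std::cout << "backend " << all[i].first
 *                   << " -> target " << all[i].second << std::endl;
 *
 *     // Only the targets usable with the plain OpenCV backend:
 *     std::vector<cv::dnn::Target> targets =
 *         cv::dnn::getAvailableTargets(cv::dnn::DNN_BACKEND_OPENCV);
 */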

//==================================================================================================

namespace
{
    typedef std::vector<MatShape> ShapesVec;

    struct LayerShapes
    {
        ShapesVec in, out, internal;
        // There is no guarantee that a layer which supports in-place computation
        // will actually be computed in-place (input.data_ptr == output.data_ptr).
        // If a layer reports that it can work in-place and the layers after it
        // no longer use the input blob, we set output = input.
        bool supportInPlace;
        LayerShapes() {supportInPlace = false;}
    };
}

Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
                  const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
                   const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> images(1, image.getMat());
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
}

Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
                    Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
    if (ddepth == CV_8U)
    {
        CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
        CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
    }

    std::vector<Mat> images;
    images_.getMatVector(images);
    CV_Assert(!images.empty());
    for (size_t i = 0; i < images.size(); i++)
    {
        Size imgSize = images[i].size();
        if (size == Size())
            size = imgSize;
        if (size != imgSize)
        {
            if(crop)
            {
              float resizeFactor = std::max(size.width / (float)imgSize.width,
                                            size.height / (float)imgSize.height);
              resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
              Rect crop(Point(0.5 * (images[i].cols - size.width),
                              0.5 * (images[i].rows - size.height)),
                        size);
              images[i] = images[i](crop);
            }
            else
              resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
        }
        if(images[i].depth() == CV_8U && ddepth == CV_32F)
            images[i].convertTo(images[i], CV_32F);
        Scalar mean = mean_;
        if (swapRB)
            std::swap(mean[0], mean[2]);

        images[i] -= mean;
        images[i] *= scalefactor;
    }

    size_t nimages = images.size();
    Mat image0 = images[0];
    int nch = image0.channels();
    CV_Assert(image0.dims == 2);
    if (nch == 3 || nch == 4)
    {
        int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
        blob_.create(4, sz, ddepth);
        Mat blob = blob_.getMat();
        Mat ch[4];

        for(size_t i = 0; i < nimages; i++ )
        {
            const Mat& image = images[i];
            CV_Assert(image.depth() == blob_.depth());
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
            CV_Assert(image.size() == image0.size());

            for( int j = 0; j < nch; j++ )
                ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
            if(swapRB)
                std::swap(ch[0], ch[2]);
            split(image, ch);
        }
    }
    else
    {
       CV_Assert(nch == 1);
       int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
       blob_.create(4, sz, ddepth);
       Mat blob = blob_.getMat();

       for(size_t i = 0; i < nimages; i++ )
       {
           const Mat& image = images[i];
           CV_Assert(image.depth() == blob_.depth());
           nch = image.channels();
           CV_Assert(image.dims == 2 && (nch == 1));
           CV_Assert(image.size() == image0.size());

           image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
       }
    }
}
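
/* Usage sketch (illustrative comment, not part of the original file):
 * converting a BGR 8-bit image into a 1x3xHxW CV_32F blob. The file name
 * is hypothetical.
 *
 *     cv::Mat img = cv::imread("input.jpg");            // CV_8UC3, BGR
 *     cv::Mat blob = cv::dnn::blobFromImage(
 *         img, 1.0 / 255.0, cv::Size(224, 224),
 *         cv::Scalar(104, 117, 123),                    // per-channel mean
 *         true,                                         // swapRB: BGR -> RGB
 *         false);                                       // no center crop
 *     // blob.size = [1, 3, 224, 224], depth CV_32F; note that the mean is
 *     // subtracted before scaling, as in the loop above.
 */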

void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
{
    CV_TRACE_FUNCTION();

    // A blob is a 4-dimensional matrix in floating-point precision:
    // blob_[0] = batchSize = nbOfImages
    // blob_[1] = nbOfChannels
    // blob_[2] = height
    // blob_[3] = width
    CV_Assert(blob_.depth() == CV_32F);
    CV_Assert(blob_.dims == 4);

    images_.create(cv::Size(1, blob_.size[0]), blob_.depth());

    std::vector<Mat> vectorOfChannels(blob_.size[1]);
    for (int n = 0; n <  blob_.size[0]; ++n)
    {
        for (int c = 0; c < blob_.size[1]; ++c)
        {
            vectorOfChannels[c] = getPlane(blob_, n, c);
        }
        cv::merge(vectorOfChannels, images_.getMatRef(n));
    }
}
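
/* Round-trip sketch (illustrative comment, not part of the original file):
 * imagesFromBlob undoes the NCHW packing done by blobFromImages (but not any
 * mean/scale preprocessing).
 *
 *     std::vector<cv::Mat> imgs = ...;                  // CV_32FC3 images
 *     cv::Mat blob = cv::dnn::blobFromImages(
 *         imgs, 1.0, cv::Size(), cv::Scalar(), false, false);  // [N, 3, H, W]
 *     std::vector<cv::Mat> restored;
 *     cv::dnn::imagesFromBlob(blob, restored);          // N HxW CV_32FC3 Mats
 */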

class OpenCLBackendWrapper : public BackendWrapper
{
public:
    OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        m.copyTo(umat);
        host = &m;
        hostDirty = false;
    }

    OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
        : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
        CV_Assert(!base.empty());

        host = &m;

        int shape[] = {1, (int)base->umat.total()};
        umat = base->umat.reshape(1, 2, &shape[0])
                         .colRange(0, host->total())
                         .reshape(1, host->dims, &host->size[0]);
        hostDirty = false;
    }

    static Ptr<BackendWrapper> create(Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
    }

    static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
    }

    static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
    {
        const int numWrappers = wrappers.size();
        std::vector<UMat> mats(wrappers.size());
        for (int i = 0; i < numWrappers; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->copyToDevice();
            mats[i] = umatWrapper->umat;
        }
        return mats;
    }

    // Replaces all umats in wrappers with the given ones.
    static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
                       const std::vector<UMat>& umats)
    {
        CV_Assert(wrappers.size() == umats.size());
        for (int i = 0, n = umats.size(); i < n; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->umat = umats[i];
        }
    }

    ~OpenCLBackendWrapper() {}

    // Copies data from device to host memory.
    virtual void copyToHost() CV_OVERRIDE
    {
        umat.copyTo(*host);
    }

    virtual void setHostDirty() CV_OVERRIDE
    {
        hostDirty = true;
    };

    void copyToDevice()
    {
        if (hostDirty)
        {
            host->copyTo(umat);
            hostDirty = false;
        }
    }

private:
    UMat umat;
    Mat* host;
    bool hostDirty;
};
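
/* Synchronization protocol sketch (illustrative comment, not part of the
 * original file): the wrapper copies host data to the device lazily.
 *
 *     cv::Mat m(1, 16, CV_32F);
 *     cv::Ptr<cv::dnn::BackendWrapper> w = OpenCLBackendWrapper::create(m);
 *     // ... m is modified on the host ...
 *     w->setHostDirty();          // mark the UMat as stale
 *     // getUMatVector() calls copyToDevice(), which uploads m only while
 *     // hostDirty is set; wrappers that are already in sync skip the copy.
 *     w->copyToHost();            // download device results back into m
 */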

struct LayerPin
{
    int lid;
    int oid;

    LayerPin(int layerId = -1, int outputId = -1)
        : lid(layerId), oid(outputId) {}

    bool valid() const
    {
        return (lid >= 0 && oid >= 0);
    }

    bool equal(const LayerPin &r) const
    {
        return (lid == r.lid && oid == r.oid);
    }

    bool operator<(const LayerPin &r) const
    {
        return lid < r.lid || (lid == r.lid && oid < r.oid);
    }

    bool operator ==(const LayerPin &r) const
    {
        return lid == r.lid && oid == r.oid;
    }
};
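
/* Note (illustrative comment, not part of the original file): operator< gives
 * LayerPin a strict lexicographic order on (lid, oid), which is what lets it
 * serve as a std::map key in BlobManager below:
 *
 *     std::map<LayerPin, int> refCounter;
 *     refCounter[LayerPin(2, 0)] = 1;   // output #0 of layer #2
 */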

struct LayerData
{
    LayerData() : id(-1), skip(false), flag(0) {}
    LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
        : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
    {
        CV_TRACE_FUNCTION();

        // add logging info
        params.name = name;
        params.type = type;
    }

    int id;
    String name;
    String type;
    LayerParams params;

    std::vector<LayerPin> inputBlobsId;
    std::set<int> inputLayersId;
    std::set<int> requiredOutputs;
    std::vector<LayerPin> consumers;
    std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;

    Ptr<Layer> layerInstance;
    std::vector<Mat> outputBlobs;
    std::vector<Mat*> inputBlobs;
    std::vector<Mat> internals;
    // Computation nodes of implemented backends (except DEFAULT).
    std::map<int, Ptr<BackendNode> > backendNodes;
    // Flag to skip this layer's computation for a specific backend.
    bool skip;

    int flag;

    Ptr<Layer> getLayerInstance()
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(type, "type", type.c_str());

        if (layerInstance)
            return layerInstance;

        layerInstance = LayerFactory::createLayerInstance(type, params);
        if (!layerInstance)
        {
            CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
        }

        return layerInstance;
    }
};

// Fake layer containing the network input blobs.
struct DataLayer : public Layer
{
    DataLayer() : Layer()
    {
        skip = false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (outputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> outputs, internals;
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |      uint8 |        fp32 |
        for (int i = 0; i < inputsData.size(); ++i)
        {
            double scale = scaleFactors[i];
            Scalar& mean = means[i];
            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (singleMean)
            {
                inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
            }
            else
            {
                for (int n = 0; n < inputsData[i].size[0]; ++n)
                    for (int c = 0; c < inputsData[i].size[1]; ++c)
                    {
                        Mat inp = getPlane(inputsData[i], n, c);
                        Mat out = getPlane(outputs[i], n, c);
                        inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                    }
            }
        }
    }

#ifdef HAVE_OPENCL
    std::vector<Mat> tmp_expressions;
    bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |       fp32 |        fp16 |
        // |      uint8 |        fp32 |
        std::vector<UMat> outputs;
        outputs_.getUMatVector(outputs);

        tmp_expressions.clear();
        for (int i = 0; i < inputsData.size(); ++i)
        {
            Mat inputData = inputsData[i];

            double scale = scaleFactors[i];
            Scalar& mean = means[i];

            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (outputs_.depth() == CV_16S)
            {
                if (singleMean)
                {
                    tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
                    convertFp16(tmp_expressions.back(), outputs[i]);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            tmp_expressions.push_back(scale * (inp - mean[c]));
                            convertFp16(tmp_expressions.back(), out);
                        }
                }
            }
            else
            {
                CV_Assert(outputs_.depth() == CV_32F);
                if (singleMean)
                {
                    inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                        }
                }
            }
        }
        return true;
    }
#endif

    int outputNameToIndex(const String& tgtName) CV_OVERRIDE
    {
        int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
        return (idx < (int)outNames.size()) ? idx : -1;
    }

    void setNames(const std::vector<String> &names)
    {
        outNames.assign(names.begin(), names.end());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == requiredOutputs);
        outputs.assign(inputs.begin(), inputs.end());
        return false;
    }

    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);

        CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
                  inputsData.size() == outputs.size());
        skip = true;
        for (int i = 0; skip && i < inputsData.size(); ++i)
        {
            if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
                skip = false;
        }
    }

    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        CV_CheckEQ(inputsData.size(), (size_t)1, "");
        CV_CheckEQ(inputsData[0].dims, 4, "");
        const size_t numChannels = inputsData[0].size[1];
        CV_Assert(numChannels <= 4);

        // Scale
        InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
                                       InferenceEngine::Layout::C);
        auto weights = InferenceEngine::make_shared_blob<float>(td);
        weights->allocate();

        float* weight_buf = weights->buffer().as<float*>();
        std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);

        // Mean subtraction
        auto biases = InferenceEngine::make_shared_blob<float>(td);
        biases->allocate();
        float* bias_buf = biases->buffer().as<float*>();

        for (int i = 0; i < numChannels; ++i)
        {
            bias_buf[i] = -means[0][i] * scaleFactors[0];
        }

        InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
        addConstantData("weights", weights, ieLayer);
        addConstantData("biases", biases, ieLayer);
        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }

    std::vector<String> outNames;
    // Preprocessing parameters for each network's input.
    std::vector<double> scaleFactors;
    std::vector<Scalar> means;
    std::vector<Mat> inputsData;
    bool skip;
};
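
/* Usage sketch (illustrative comment, not part of the original file): the
 * scaleFactors/means above come from the public Net::setInput() overload, so
 * preprocessing can be attached to an already-built blob. The input name
 * "data" is hypothetical.
 *
 *     cv::Mat blob = cv::dnn::blobFromImage(img);       // no scale/mean yet
 *     net.setInput(blob, "data",
 *                  1.0 / 255.0,                         // scalefactor
 *                  cv::Scalar(104, 117, 123));          // mean
 *     // DataLayer::forward() then computes (blob - mean) * scalefactor.
 */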

struct BlobManager
{
public:
    // Increase the reference counter of a layer output.
    void addReference(const LayerPin& lp)
    {
        std::map<LayerPin, int>::iterator it = refCounter.find(lp);
        if (it == refCounter.end())
            refCounter[lp] = 1;
        else
            it->second += 1;
    }

    void addReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            addReference(pins[i]);
        }
    }

    // Returns the number of references to the allocated memory used by a
    // specific layer blob.
    int numReferences(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());
        LayerPin memHost = mapIt->second;

        std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
        CV_Assert(refIt != refCounter.end());
        return refIt->second;
    }

    // Reuse data allocated in <host> inside the <user> blob.
    void reuse(const LayerPin& host, const LayerPin& user)
    {
        CV_Assert(reuseMap.find(user) == reuseMap.end());
        CV_Assert(reuseMap.find(host) != reuseMap.end());
        LayerPin memHost = reuseMap[host];
        reuseMap[user] = memHost;
        if (refCounter.find(memHost) != refCounter.end())
        {
            std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
            if (userRefIt != refCounter.end())
            {
                refCounter[memHost] += userRefIt->second;
                refCounter.erase(userRefIt);
            }
            else
                refCounter[memHost] += 1;
        }
    }

    // Decrease the reference counter of the memory allocated for a specific blob.
    void releaseReference(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());

        std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
        CV_Assert(refIt != refCounter.end());
        CV_Assert(refIt->second > 0);
        refIt->second -= 1;
    }

    void releaseReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            releaseReference(pins[i]);
        }
    }

    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
    {
        if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
        {
            Mat bestBlob;
            LayerPin bestBlobPin;

            std::map<LayerPin, Mat>::iterator hostIt;
            std::map<LayerPin, int>::iterator refIt;

            const int targetTotal = total(shape);
            int bestBlobTotal = INT_MAX;

            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
            {
                refIt = refCounter.find(hostIt->first);
                // Use only blobs that have been referenced before; an
                // unreferenced blob might be used as an output.
                if (refIt != refCounter.end() && refIt->second == 0)
                {
                    Mat& unusedBlob = hostIt->second;
                    if (unusedBlob.total() >= targetTotal &&
                        unusedBlob.total() < bestBlobTotal)
                    {
                        bestBlobPin = hostIt->first;
                        bestBlob = unusedBlob;
                        bestBlobTotal = unusedBlob.total();
                    }
                }
            }
            if (!bestBlob.empty())
            {
                reuse(bestBlobPin, lp);
                dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
                return;
            }
        }

        {
            // If dst has already been allocated with total(shape) elements,
            // it won't be recreated and the dst.data pointer stays the same.
            dst.create(shape, use_half ? CV_16S : CV_32F);
            addHost(lp, dst);
        }
    }

    void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
                               std::vector<LayerPin>& pinsForInternalBlobs,
                               bool use_half = false)
    {
        CV_TRACE_FUNCTION();

        pinsForInternalBlobs.clear();

        std::vector<Mat>& outputBlobs = ld.outputBlobs,
                &internalBlobs = ld.internals;

        const ShapesVec& outShapes = layerShapes.out,
                internalShapes = layerShapes.internal;

        outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
        internalBlobs.resize(internalShapes.size());

        CV_Assert(ld.requiredOutputs.size() <= outShapes.size());

        // Check whether the layer could work in-place.
        bool inPlace = false;
        if (layerShapes.supportInPlace)
        {
            if (ld.inputBlobs.size() == 1)
            {
                // Get the number of references to the input memory.
                int numRef = numReferences(ld.inputBlobsId[0]);
                // In-place is possible only if the current layer is the one
                // and only consumer of this blob.
                inPlace = numRef == 1;
            }
        }

        ShapesVec shapes(outShapes);
        shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
        std::vector<Mat*> blobs;
        for(int i = 0; i < outputBlobs.size(); i++)
        {
            blobs.push_back(&outputBlobs[i]);
        }

        for(int i = 0; i < internalBlobs.size(); i++)
        {
            blobs.push_back(&internalBlobs[i]);
            if (total(internalShapes[i]))
            {
                pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
            }
        }

        addReferences(pinsForInternalBlobs);

        std::map<int, std::vector<int> > idxSizes;
        for(int i = 0; i < shapes.size(); i++)
        {
            idxSizes[total(shapes[i])].push_back(i);
        }

        std::map<int, std::vector<int> >::reverse_iterator it;
        for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
        {
            for(int j = 0; j < it->second.size(); j++)
            {
                int index = it->second[j];
                if (total(shapes[index]))
                {
                    LayerPin blobPin(ld.id, index);
                    if (index < outShapes.size() && inPlace)
                    {
                        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
                        reuse(ld.inputBlobsId[0], blobPin);
                    }
                    else
                        reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
                }
            }
        }
    }

    // Clear the internal state. Called before every reallocation.
    void reset()
    {
        CV_TRACE_FUNCTION();

        refCounter.clear();
        reuseMap.clear();
        memHosts.clear();
    }

private:
    // Register allocated memory.
    void addHost(const LayerPin& lp, const Mat& mat)
    {
        CV_Assert(memHosts.find(lp) == memHosts.end());
        reuseMap[lp] = lp;
        memHosts[lp] = mat;
    }

    std::map<LayerPin, int> refCounter;
    // Maps a pin to its origin blob (the one for which memory was allocated
    // first). For origin blobs, key == value.
    std::map<LayerPin, LayerPin> reuseMap;
    std::map<LayerPin, Mat> memHosts;
};
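
/* Reuse protocol sketch (illustrative comment, not part of the original
 * file; hypothetical standalone use of this internal class):
 *
 *     BlobManager bm;
 *     Mat a, b;
 *     LayerPin p0(0, 0), p1(1, 0);
 *     bm.reuseOrCreate(shape(1, 3, 4, 4), p0, a, false); // fresh allocation
 *     bm.addReference(p0);      // register a future consumer of p0
 *     bm.releaseReference(p0);  // consumer done; refcount drops to 0
 *     bm.reuseOrCreate(shape(1, 3, 4, 4), p1, b, false); // b now aliases a
 */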

static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
    if (backendId == DNN_BACKEND_OPENCV)
    {
        if (targetId == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();
        else if (IS_DNN_OPENCL_TARGET(targetId))
            return OpenCLBackendWrapper::create(m);
        else
            CV_Error(Error::StsNotImplemented, "Unknown target identifier");
    }
    else if (backendId == DNN_BACKEND_HALIDE)
    {
        CV_Assert(haveHalide());
#ifdef HAVE_HALIDE
        return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
#endif  // HAVE_HALIDE
    }
    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    {
        CV_Assert(haveInfEngine());
#ifdef HAVE_INF_ENGINE
        return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
#endif  // HAVE_INF_ENGINE
    }
    else if (backendId == DNN_BACKEND_VKCOM)
    {
        CV_Assert(haveVulkan());
#ifdef HAVE_VULKAN
        return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
#endif  // HAVE_VULKAN
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    return Ptr<BackendWrapper>();
}

struct Net::Impl
{
    typedef std::map<int, LayerShapes> LayersShapesMap;
    typedef std::map<int, LayerData> MapIdToLayerData;

    Impl()
    {
        // allocate the fake net input layer
        netInputLayer = Ptr<DataLayer>(new DataLayer());
        LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
        inpl.id = 0;
        netInputLayer->name = inpl.name = "_input";
        inpl.type = "__NetInputLayer__";
        inpl.layerInstance = netInputLayer;
        layerNameToId.insert(std::make_pair(inpl.name, inpl.id));

        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
        isAsync = false;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
    }

    Ptr<DataLayer> netInputLayer;
    std::vector<LayerPin> blobsToKeep;
    MapIdToLayerData layers;
    std::map<String, int> layerNameToId;
    BlobManager blobManager;
    int preferableBackend;
    int preferableTarget;
    String halideConfigFile;
    bool skipInfEngineInit;
    // Maps host data to a backend-specific wrapper.
    std::map<void*, Ptr<BackendWrapper> > backendWrappers;

    int lastLayerId;

    bool netWasAllocated;
    bool fusion;
    bool isAsync;
    std::vector<int64> layersTimings;
    Mat output_blob;

    Ptr<BackendWrapper> wrap(Mat& host)
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();

        MatShape shape(host.dims);
        for (int i = 0; i < host.dims; ++i)
            shape[i] = host.size[i];

        void* data = host.data;
        if (backendWrappers.find(data) != backendWrappers.end())
        {
            Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
            if (preferableBackend == DNN_BACKEND_OPENCV)
            {
                CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
                return OpenCLBackendWrapper::create(baseBuffer, host);
            }
            else if (preferableBackend == DNN_BACKEND_HALIDE)
            {
                CV_Assert(haveHalide());
  #ifdef HAVE_HALIDE
                return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
  #endif  // HAVE_HALIDE
            }
            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            {
                return wrapMat(preferableBackend, preferableTarget, host);
            }
            else if (preferableBackend == DNN_BACKEND_VKCOM)
            {
  #ifdef HAVE_VULKAN
                return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
  #endif
            }
            else
                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
        }

        Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
        backendWrappers[data] = wrapper;
        return wrapper;
    }

#ifdef HAVE_HALIDE
    void compileHalide()
    {
        CV_TRACE_FUNCTION();

        CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);

        HalideScheduler scheduler(halideConfigFile);
        std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
        {
            LayerData &ld = it->second;
            Ptr<Layer> layer = ld.layerInstance;
            if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
            {
                CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
                bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
                if (!scheduled)
                {
                    // Use automatic scheduling provided by layer.
                    layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
                                                ld.inputBlobs, ld.outputBlobs,
                                                preferableTarget);
                }
                compileList.emplace_back(ld);
            }
        }
        std::atomic<int> progress(0);
        auto fn = ([&] () -> void
        {
            for (;;)
            {
                int id = progress.fetch_add(1);
                if ((size_t)id >= compileList.size())
                    return;
                const LayerData& ld = compileList[id].get();
                Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
                dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
            }
        });
        size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
        num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
        std::vector<std::thread> threads(num_threads - 1);
        for (auto& t: threads) t = std::thread(fn);
        fn(); // process own tasks
        for (auto& t: threads) t.join();
    }
#endif

    void clear()
    {
        CV_TRACE_FUNCTION();

        MapIdToLayerData::iterator it;
        for (it = layers.begin(); it != layers.end(); it++)
        {
            if (it->second.id != 0) {
                it->second.inputBlobs.clear();
                it->second.outputBlobs.clear();
                it->second.internals.clear();
            }
            it->second.skip = false;
            //it->second.consumers.clear();
            Ptr<Layer> currLayer = it->second.layerInstance;

            if( currLayer.empty() )
                continue;

            currLayer->unsetAttached();
        }

        layersTimings.clear();
    }

    void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
    {
        CV_TRACE_FUNCTION();

        if (preferableBackend == DNN_BACKEND_DEFAULT)
            preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;

        CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16);
        CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL);
        CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16 ||
                  preferableTarget == DNN_TARGET_MYRIAD ||
                  preferableTarget == DNN_TARGET_FPGA);
        CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
                  preferableTarget == DNN_TARGET_VULKAN);
        if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
        {
            if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
            {
                CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
                preferableTarget = DNN_TARGET_CPU;
            }
#else
            {
                if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
                {
                    // Current implementation is only valid for GPU (#11494)
                    if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
                    {
                        CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
                        preferableTarget = DNN_TARGET_CPU;
                    }
                    else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
                    {
                        CV_LOG_WARNING(NULL,
                            "DNN: OpenCL target with fp16 precision is not supported "
                            "with current OpenCL device (tested with Intel GPUs only), "
                            "switching to OpenCL with fp32 precision.");
                        preferableTarget = DNN_TARGET_OPENCL;
                    }
                }
            }
#endif
            if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
            {
                preferableBackend = DNN_BACKEND_OPENCV;
                preferableTarget = DNN_TARGET_CPU;
            }

            clear();

            allocateLayers(blobsToKeep_);

            MapIdToLayerData::iterator it = layers.find(0);
            CV_Assert(it != layers.end());
            it->second.skip = netInputLayer->skip;

            initBackend();

            if (!netWasAllocated )
            {
#ifdef HAVE_HALIDE
                if (preferableBackend == DNN_BACKEND_HALIDE)
                    compileHalide();
#else
                CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
#endif
            }

            netWasAllocated = true;
            this->blobsToKeep = blobsToKeep_;
        }
    }
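
    /* Driver sketch (illustrative comment, not part of the original file):
     * the backend/target checks above correspond to user code such as the
     * following; the model path is hypothetical.
     *
     *     cv::dnn::Net net = cv::dnn::readNet("model.onnx");
     *     net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);
     *     net.setPreferableTarget(cv::dnn::DNN_TARGET_OPENCL);
     *     net.setInput(blob);
     *     cv::Mat out = net.forward();   // setUpNet() runs on first forward
     */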

    int getLayerId(const String &layerName)
    {
        std::map<String, int>::iterator it = layerNameToId.find(layerName);
        return (it != layerNameToId.end()) ? it->second : -1;
    }

    int getLayerId(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? id : -1;
    }

    int getLayerId(DictValue &layerDesc)
    {
        if (layerDesc.isInt())
            return getLayerId(layerDesc.get<int>());
        else if (layerDesc.isString())
            return getLayerId(layerDesc.get<String>());

        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        return -1;
    }

    String getLayerName(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? it->second.name : "(unknown layer)";
    }

    LayerData& getLayerData(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);

        if (it == layers.end())
            CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));

        return it->second;
    }

    LayerData& getLayerData(const String &layerName)
    {
        int id = getLayerId(layerName);

        if (id < 0)
            CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");

        return getLayerData(id);
    }

    LayerData& getLayerData(const DictValue &layerDesc)
    {
        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        if (layerDesc.isInt())
            return getLayerData(layerDesc.get<int>());
        else /*if (layerDesc.isString())*/
            return getLayerData(layerDesc.get<String>());
    }

    static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
    {
        if ((int)ld.inputBlobsId.size() <= inNum)
        {
            ld.inputBlobsId.resize(inNum + 1);
        }
        else
        {
            LayerPin storedFrom = ld.inputBlobsId[inNum];
            if (storedFrom.valid() && !storedFrom.equal(from))
                CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
                                                 inNum, ld.name.c_str()));
        }

        ld.inputBlobsId[inNum] = from;
    }

    int resolvePinOutputName(LayerData &ld, const String &outName)
    {
        if (outName.empty())
            return 0;
        return ld.getLayerInstance()->outputNameToIndex(outName);
    }

    LayerPin getPinByAlias(const String &layerName)
    {
        LayerPin pin;
        pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        if (pin.lid >= 0)
            pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);

        return pin;
    }

    std::vector<LayerPin> getLayerOutPins(const String &layerName)
    {
        int lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        std::vector<LayerPin> pins;

        for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
        {
            pins.push_back(LayerPin(lid, i));
        }

        return pins;
    }

    void connect(int outLayerId, int outNum, int inLayerId, int inNum)
    {
        CV_Assert(outLayerId < inLayerId);
        LayerData &ldOut = getLayerData(outLayerId);
        LayerData &ldInp = getLayerData(inLayerId);

        addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
        ldOut.requiredOutputs.insert(outNum);
        ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
    }

    void initBackend()
    {
        CV_TRACE_FUNCTION();
        if (preferableBackend == DNN_BACKEND_OPENCV)
            CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
        else if (preferableBackend == DNN_BACKEND_HALIDE)
            initHalideBackend();
        else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            initInfEngineBackend();
        else if (preferableBackend == DNN_BACKEND_VKCOM)
            initVkComBackend();
        else
            CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    }

    void initHalideBackend()
    {
        CV_TRACE_FUNCTION();
        CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());

        // Iterator to the current layer.
        MapIdToLayerData::iterator it = layers.begin();
        // Iterator to the base layer for fusion. For example, in the case of
        // conv+bn+relu it will be the conv layer.
        MapIdToLayerData::iterator baseIt = layers.begin();
        for (; it != layers.end(); it++)
        {
            LayerData &ldTop = it->second;
            Ptr<Layer> layerTop = ldTop.layerInstance;
            if (!layerTop->supportBackend(preferableBackend))
            {
                // Move the base iterator to a layer that doesn't support the
                // preferable backend, to prevent fusion across layers from
                // different backends.
                baseIt = it;
                continue;
            }
            // Try to fuse layers.
            LayerData &ldBot = baseIt->second;
            Ptr<Layer> layerBot = ldBot.layerInstance;
            // 1. Check that bottom and top are from the same backend.
            if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
            {
                // 2. Check that the current layer works in-place.
                bool inPlace = ldTop.inputBlobs.size() == 1 &&
                               ldBot.outputBlobs.size() == 1 &&
                               ldTop.inputBlobs[0]->data ==
                               ldBot.outputBlobs[0].data;
                if (inPlace)
                {
                    // 3. Try to attach the node.
                    CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
                    Ptr<BackendNode> fusedNode =
                        layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
                    if (!fusedNode.empty())
                    {
                        ldTop.skip = true;
                        ldBot.backendNodes[preferableBackend] = fusedNode;
                        ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
                        continue;
                    }
                }
            }
            // No layer fusion.
            ldTop.skip = false;
            ldTop.backendNodes[DNN_BACKEND_HALIDE] =
                layerTop->initHalide(ldTop.inputBlobsWrappers);
            baseIt = it;
        }
    }
1450
1451 #ifdef HAVE_INF_ENGINE
1452     // Before launching Inference Engine graph we need to specify output blobs.
1453     // This function requests output blobs based on inputs references of
1454     // layers from default backend or layers from different graphs.
1455     void addInfEngineNetOutputs(LayerData &ld)
1456     {
1457         Ptr<InfEngineBackendNet> layerNet;
1458         if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
1459         {
1460             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1461             if (!node.empty())
1462             {
1463                 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1464                 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1465                 layerNet = ieNode->net;
1466             }
1467         }
1468         // For an every input reference we check that it belongs to one of
1469         // the Inference Engine backend graphs. Request an output blob if it is.
1470         // Do nothing if layer's input is from the same graph.
1471         for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1472         {
1473             LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1474             Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1475             if (!inpNode.empty())
1476             {
1477                 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1478                 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1479                 if (layerNet != ieInpNode->net)
1480                 {
1481                     // layerNet is empty or nodes are from different graphs.
1482                     ieInpNode->net->addOutput(ieInpNode->layer.getName());
1483                 }
1484             }
1485         }
1486     }
1487 #endif  // HAVE_INF_ENGINE
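    // Illustrative example (not code): for a chain
    //   [IE graph A] -> (CPU-only layer) -> [IE graph B]
    // the CPU layer's input reference makes graph A register that blob as a
    // network output, so the default backend can read it between the graphs.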
1488
1489     void initVkComBackend()
1490     {
1491         CV_TRACE_FUNCTION();
1492         CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
1493 #ifdef HAVE_VULKAN
1494         if (!haveVulkan())
1495             return;
1496
1497         MapIdToLayerData::iterator it = layers.begin();
1498         for (; it != layers.end(); it++)
1499         {
1500             LayerData &ld = it->second;
1501             Ptr<Layer> layer = ld.layerInstance;
1502             if (!layer->supportBackend(preferableBackend))
1503             {
1504                 continue;
1505             }
1506
1507             ld.skip = false;
1508
1509             try
1510             {
1511                 ld.backendNodes[DNN_BACKEND_VKCOM] =
1512                     layer->initVkCom(ld.inputBlobsWrappers);
1513             }
1514             catch (const cv::Exception& e)
1515             {
1516                 CV_LOG_ERROR(NULL, "initVkCom failed, falling back to CPU implementation. " << e.what());
1517                 ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
1518             }
1519         }
1520 #endif
1521     }
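    // Usage sketch (illustrative only; the model file is a placeholder):
#if 0
    void exampleVkComUsage()
    {
        Net net = readNet("model.onnx");
        net.setPreferableBackend(DNN_BACKEND_VKCOM);
        net.setPreferableTarget(DNN_TARGET_VULKAN);
        net.setInput(blobFromImage(Mat(224, 224, CV_8UC3)));  // placeholder input
        Mat out = net.forward();  // layers whose initVkCom() failed fall back to CPU
    }
#endif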
1522
1523     void initInfEngineBackend()
1524     {
1525         CV_TRACE_FUNCTION();
1526         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1527 #ifdef HAVE_INF_ENGINE
1528         MapIdToLayerData::iterator it;
1529         Ptr<InfEngineBackendNet> net;
1530
1531         for (it = layers.begin(); it != layers.end(); ++it)
1532         {
1533             LayerData &ld = it->second;
1534             if (ld.id == 0)
1535             {
1536                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1537                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1538                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1539                 {
1540                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1541 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1542                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1543 #else
1544                     dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
1545 #endif
1546                 }
1547             }
1548             else
1549             {
1550                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1551                 {
1552                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1553 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1554                     dataPtr->name = ld.name;
1555 #else
1556                     dataPtr->setName(ld.name);
1557 #endif
1558                 }
1559             }
1560         }
1561
1562         if (skipInfEngineInit)
1563         {
1564             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1565             CV_Assert(!node.empty());
1566
1567             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1568             CV_Assert(!ieNode.empty());
1569
1570             for (it = layers.begin(); it != layers.end(); ++it)
1571             {
1572                 LayerData &ld = it->second;
1573                 if (ld.id == 0)
1574                 {
1575                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1576                     {
1577                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1578 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1579                         dataPtr->name = netInputLayer->outNames[i];
1580 #else
1581                         dataPtr->setName(netInputLayer->outNames[i]);
1582 #endif
1583                     }
1584                 }
1585                 else
1586                 {
1587                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1588                     {
1589                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1590 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1591                         dataPtr->name = ld.name;
1592 #else
1593                         dataPtr->setName(ld.name);
1594 #endif
1595                     }
1596                 }
1597                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1598                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1599                 ld.skip = true;
1600             }
1601             layers[lastLayerId].skip = false;
1602             ieNode->net->init(preferableTarget);
1603             return;
1604         }
1605
1606         // Build Inference Engine networks from sets of layers that support this
1607         // backend. Split the whole model into several Inference Engine networks if
1608         // some of the layers are not implemented.
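        // Illustrative split: for a model Conv -> CustomOp -> Conv where CustomOp
        // is unsupported, this produces IE network #1 {Conv}, a default-backend
        // fallback {CustomOp} and IE network #2 {Conv}.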
1609
1610         // Set of all input and output blob wrappers for the current network.
1611         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1612         for (it = layers.begin(); it != layers.end(); ++it)
1613         {
1614             LayerData &ld = it->second;
1615             if (ld.id == 0 && ld.skip)
1616                 continue;
1617             bool fused = ld.skip;
1618
1619             Ptr<Layer> layer = ld.layerInstance;
1620             if (!fused && !layer->supportBackend(preferableBackend))
1621             {
1622                 bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 &&
1623                                     INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2);
1624                 // TODO: there is a bug in the Myriad plugin with custom layers shape inference.
1625                 if (preferableTarget == DNN_TARGET_MYRIAD)
1626                 {
1627                     for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
1628                     {
1629                         customizable = ld.inputBlobs[i]->size[0] == 1;
1630                     }
1631                 }
1632
1633                 // TODO: fix these workarounds
1634                 if (preferableTarget == DNN_TARGET_MYRIAD ||
1635                     preferableTarget == DNN_TARGET_OPENCL ||
1636                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1637                     customizable &= ld.type != "Concat";
1638
1639                 if (preferableTarget == DNN_TARGET_OPENCL ||
1640                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1641                     customizable &= ld.type != "Power";
1642
1643                 if (preferableTarget == DNN_TARGET_OPENCL)
1644                     customizable &= ld.type != "Eltwise";
1645
1646                 if (!customizable)
1647                 {
1648                     addInfEngineNetOutputs(ld);
1649                     net = Ptr<InfEngineBackendNet>();
1650                     netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in an #ifdef.
1651                     layer->preferableTarget = DNN_TARGET_CPU;
1652                     continue;
1653                 }
1654             }
1655             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1656
1657             // Create a new network if one of the inputs comes from a different Inference Engine graph.
1658             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1659             {
1660                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1661                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1662                 if (!inpNode.empty())
1663                 {
1664                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1665                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1666                     if (ieInpNode->net != net)
1667                     {
1668                         net = Ptr<InfEngineBackendNet>();
1669                         netBlobsWrappers.clear();  // Not used for the R5 release, but we don't wrap it in an #ifdef.
1670                         break;
1671                     }
1672                 }
1673             }
1674
1675             Ptr<BackendNode> node;
1676             if (!net.empty())
1677             {
1678                 if (fused)
1679                 {
1680                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1681                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1682                     CV_Assert(inPlace);
1683                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1684                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1685                 }
1686             }
1687             else
1688                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1689
1690             if (!fused)
1691             {
1692                 if (layer->supportBackend(preferableBackend))
1693                     node = layer->initInfEngine(ld.inputBlobsWrappers);
1694                 else
1695                 {
1696                     node = Ptr<BackendNode>(new InfEngineBackendNode(
1697                         ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
1698                 }
1699             }
1700             else if (node.empty())
1701                 continue;
1702
1703             CV_Assert(!node.empty());
1704             ld.backendNodes[preferableBackend] = node;
1705
1706             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1707             CV_Assert(!ieNode.empty());
1708             ieNode->net = net;
1709
1710             // Convert weights to FP16 for specific targets.
1711             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1712                  preferableTarget == DNN_TARGET_MYRIAD ||
1713                  preferableTarget == DNN_TARGET_FPGA) && !fused)
1714             {
1715 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
1716                 for (const std::string& name : {"weights", "biases"})
1717                 {
1718                     auto it = ieNode->layer.getParameters().find(name);
1719                     if (it != ieNode->layer.getParameters().end())
1720                     {
1721                         InferenceEngine::Blob::Ptr bp = it->second.as<InferenceEngine::Blob::Ptr>();
1722                         it->second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(bp));
1723                     }
1724                 }
1725 #else
1726                 auto& blobs = ieNode->layer.getConstantData();
1727                 if (blobs.empty())
1728                 {
1729                     // In case of a non-weightable layer we have to specify
1730                     // its precision by adding a dummy blob.
1731                     auto blob = InferenceEngine::make_shared_blob<int16_t>(
1732                                     InferenceEngine::Precision::FP16,
1733                                     InferenceEngine::Layout::C, {1});
1734                     blob->allocate();
1735                     blobs[""] = blob;
1736                 }
1737                 else
1738                 {
1739                     for (auto& it : blobs)
1740                         it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
1741                 }
1742 #endif
1743             }
1744
1745             if (!fused)
1746                 net->addLayer(ieNode->layer);
1747
1748             net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
1749             net->addBlobs(ld.inputBlobsWrappers);
1750             net->addBlobs(ld.outputBlobsWrappers);
1751             addInfEngineNetOutputs(ld);
1752         }
1753
1754         // Initialize all networks.
1755         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1756         {
1757             LayerData &ld = it->second;
1758             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1759                 continue;
1760
1761             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1762             if (node.empty())
1763                 continue;
1764
1765             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1766             if (ieNode.empty())
1767                 continue;
1768
1769             CV_Assert(!ieNode->net.empty());
1770
1771             if (!ieNode->net->isInitialized())
1772             {
1773                 ieNode->net->init(preferableTarget);
1774                 ld.skip = false;
1775             }
1776         }
1777 #endif  // HAVE_INF_ENGINE
1778     }
1779
1780     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1781     {
1782         CV_TRACE_FUNCTION();
1783
1784         LayerData &ld = layers[lid];
1785
1786         //already allocated
1787         if (ld.flag)
1788             return;
1789
1790         size_t ninputs = ld.inputBlobsId.size();
1791 #if 0
1792         printf("layer %s:", ld.name.c_str());
1793         for (size_t i = 0; i < ninputs; i++)
1794         {
1795             int inp_lid = ld.inputBlobsId[i].lid;
1796             LayerData &inp_ld = layers[inp_lid];
1797             int inp_outputs = (int)inp_ld.outputBlobs.size();
1798             std::cout << " " << inp_ld.name << "(" << inp_outputs;
1799
1800             for( int j = 0; j < inp_outputs; j++ )
1801             {
1802                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1803             }
1804             std::cout << ")";
1805         }
1806         printf("\n");
1807 #endif
1808
1809         //determine parent layers
1810         for (size_t i = 0; i < ninputs; i++)
1811             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1812
1813         //allocate parents
1814         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1815             allocateLayer(*i, layersShapes);
1816
1817         //bind inputs
1818         if (ld.id == 0)  // DataLayer
1819         {
1820             ninputs = netInputLayer->inputsData.size();
1821             ld.inputBlobsWrappers.resize(ninputs);
1822             for (size_t i = 0; i < ninputs; i++)
1823             {
1824                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1825             }
1826         }
1827         else
1828         {
1829             ld.inputBlobs.resize(ninputs);
1830             ld.inputBlobsWrappers.resize(ninputs);
1831             for (size_t i = 0; i < ninputs; i++)
1832             {
1833                 LayerPin from = ld.inputBlobsId[i];
1834                 CV_Assert(from.valid());
1835                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1836                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1837                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1838             }
1839         }
1840
1841         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1842
1843         CV_Assert(layerShapesIt != layersShapes.end());
1844
1845         std::vector<LayerPin> pinsForInternalBlobs;
1846         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1847                                           preferableBackend == DNN_BACKEND_OPENCV &&
1848                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
1849         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1850         for (int i = 0; i < ld.outputBlobs.size(); ++i)
1851         {
1852             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1853         }
1854         ld.internalBlobsWrappers.resize(ld.internals.size());
1855         for (int i = 0; i < ld.internals.size(); ++i)
1856         {
1857             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1858         }
1859
1860         Ptr<Layer> layerPtr = ld.getLayerInstance();
1861         {
1862             std::vector<Mat> inps(ld.inputBlobs.size());
1863             for (int i = 0; i < ld.inputBlobs.size(); ++i)
1864             {
1865                 inps[i] = *ld.inputBlobs[i];
1866             }
1867             layerPtr->finalize(inps, ld.outputBlobs);
1868             layerPtr->preferableTarget = preferableTarget;
1869 #if 0
1870             std::cout << "\toutputs:";
1871             size_t noutputs = ld.outputBlobs.size();
1872             for (size_t j = 0; j < noutputs; j++)
1873             {
1874                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
1875             }
1876             std::cout << "\n";
1877 #endif
1878         }
1879
1880         // After allocating the layer, we decrease the reference counters of its input blobs.
1881         blobManager.releaseReferences(ld.inputBlobsId);
1882         blobManager.releaseReferences(pinsForInternalBlobs);
1883
1884         ld.flag = 1;
1885     }
1886
1887 #if 0
1888 #define printf_(args) printf args
1889 #else
1890 #define printf_(args)
1891 #endif
1892
1893     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
1894     {
1895         if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
1896                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
1897             return;
1898
1899         CV_TRACE_FUNCTION();
1900
1901         // Scan through all the layers. If there is a convolution layer followed by an activation
1902         // layer, we try to embed the activation into the convolution and disable its separate execution.
1903         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
1904                                       blobsToKeep_.end());
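        // Illustrative effect (pseudo-graph, assuming the layers support fusion):
        //   Conv -> BatchNorm -> ReLU   becomes   Conv(+BN, +ReLU)
        // where the BatchNorm and ReLU layers are marked as skipped; they keep
        // their LayerData but perform no computation in forward().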
1905         MapIdToLayerData::iterator it;
1906         for (it = layers.begin(); it != layers.end(); it++)
1907         {
1908             int lid = it->first;
1909             LayerData& ld = layers[lid];
1910             if( ld.skip )
1911             {
1912                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1913                 continue;
1914             }
1915             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1916
1917             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
1918             // with the current layer if they follow it. Normally, they are fused with the
1919             // convolution layer, but some of them (like activation) may be fused with
1920             // fully-connected, element-wise (+) and some other layers.
1921             Ptr<Layer>& currLayer = ld.layerInstance;
1922             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
1923             {
1924                 LayerData* nextData = &layers[ld.consumers[0].lid];
1925                 LayerPin lpNext(ld.consumers[0].lid, 0);
1926                 while (nextData)
1927                 {
1928                     Ptr<Layer> nextLayer = nextData->layerInstance;
1929                     if (currLayer->tryFuse(nextLayer))
1930                     {
1931                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
1932                         nextData->skip = true;
1933                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1934                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1935                         if (nextData->consumers.size() == 1)
1936                         {
1937                             int nextLayerId = nextData->consumers[0].lid;
1938                             nextData = &layers[nextLayerId];
1939                             lpNext = LayerPin(nextLayerId, 0);
1940                         }
1941                         else
1942                         {
1943                             nextData = 0;
1944                             break;
1945                         }
1946                     }
1947                     else
1948                         break;
1949                 }
1950
1951                 if (preferableBackend != DNN_BACKEND_OPENCV)
1952                     continue;  // Go to the next layer.
1953
1954                 // TODO: make the OpenCL target support more fusion styles.
1955                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
1956                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
1957                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
1958                      ld.layerInstance->type != "Concat")) )
1959                     continue;
1960
1961                 while (nextData)
1962                 {
1963                     // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations
1964                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
1965                         nextData->type != "ReLU" &&
1966                         nextData->type != "ChannelsPReLU" &&
1967                         nextData->type != "ReLU6" &&
1968                         nextData->type != "TanH" &&
1969                         nextData->type != "Power")
1970                         break;
1971
1972                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1973                     if (nextActivLayer.empty())
1974                         break;
1975
1976                     if (currLayer->setActivation(nextActivLayer))
1977                     {
1978                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1979                         nextData->skip = true;
1980                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1981                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1982                         if (nextData->consumers.size() == 1)
1983                         {
1984                             int nextLayerId = nextData->consumers[0].lid;
1985                             nextData = &layers[nextLayerId];
1986                             lpNext = LayerPin(nextLayerId, 0);
1987                         }
1988                         else
1989                         {
1990                             nextData = 0;
1991                             break;
1992                         }
1993                     }
1994                     else
1995                         break;
1996                 }
1997
1998                 // Fuse a convolution layer followed by eltwise + activation (ReLU/ChannelsPReLU/Power).
1999                 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
2000                 {
2001                     Ptr<EltwiseLayer> nextEltwiseLayer;
2002                     if( nextData )
2003                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
2004
2005                     if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2006                         nextData && nextData->inputBlobsId.size() == 2 )
2007                     {
2008                         LayerData *eltwiseData = nextData;
2009
2010                         // The Eltwise layer has two inputs. We need to determine which
2011                         // is the base convolution layer and which could be used as its bias.
2012                         LayerData* biasLayerData = 0;
2013                         for (int i = 0; i < 2; ++i)
2014                         {
2015                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
2016                             CV_Assert(downLayerData);
2017                             while (downLayerData->skip)
2018                             {
2019                                 if (downLayerData->inputBlobsId.size() == 1)
2020                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
2021                                 else
2022                                 {
2023                                     downLayerData = 0;
2024                                     break;
2025                                 }
2026                             }
2027                             if (downLayerData && ld.id == downLayerData->id)
2028                             {
2029                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
2030                                 break;
2031                             }
2032                         }
2033                         CV_Assert(biasLayerData);
2034                         {
2035                             if( eltwiseData->consumers.size() == 1 )
2036                             {
2037                                 // fuse eltwise + activation layer
2038                                 if (biasLayerData->id < ld.id)
2039                                 {
2040                                     nextData = &layers[eltwiseData->consumers[0].lid];
2041                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
2042                                     Ptr<ActivationLayer> nextActivLayer;
2043                                     if( nextData )
2044                                         nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2045
2046                                     if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2047                                             (!nextData->type.compare("ReLU") ||
2048                                              !nextData->type.compare("ChannelsPReLU") ||
2049                                              !nextData->type.compare("Power")) &&
2050                                             currLayer->setActivation(nextActivLayer) )
2051                                     {
2052                                         CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2053                                         ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2054                                         printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2055                                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2056                                         eltwiseData->skip = true;
2057                                         nextData->skip = true;
2058                                         // This optimization is for cases like
2059                                         // some_layer   conv
2060                                         //   |             |
2061                                         //   +-- eltwise --+
2062                                         //          |
2063                                         //        activ
2064                                         // This way all the element-wise computations
2065                                         // (i.e. some_layer+conv or some_layer*conv)
2066                                         // would be done at [conv] layer. So we need to
2067                                         // replace [conv]'s output blob with [eltwise]'s one
2068                                         // considering that [activ] is an in-place layer.
2069                                         // Also we need to move all the consumers' references.
2070                                         // To prevent memory collisions (i.e. when input of
2071                                         // [conv] and output of [eltwise] is the same blob)
2072                                         // we allocate a new blob.
2073                                         CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2074                                         ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2075                                         ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2076
2077                                         eltwiseData->outputBlobs = ld.outputBlobs;
2078                                         nextData->outputBlobs = ld.outputBlobs;
2079                                         eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2080                                         nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
2081
2082                                         // Move references of [activ] layer consumers to the newly allocated blob.
2083                                         for (int i = 0; i < nextData->consumers.size(); ++i)
2084                                         {
2085                                             LayerData& consumer = layers[nextData->consumers[i].lid];
2086                                             for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2087                                             {
2088                                                 if (consumer.inputBlobsId[j].lid == lpNext.lid)
2089                                                 {
2090                                                     consumer.inputBlobs[j] = &ld.outputBlobs[0];
2091                                                     consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2092                                                     break;
2093                                                 }
2094                                             }
2095                                         }
2096                                     }
2097                                 }
2098                             }
2099                         }
2100                     }
2101                 }
2102             }
2103
2104             if (preferableBackend != DNN_BACKEND_OPENCV)
2105                 continue;  // Go to the next layer.
2106
2107             // Optimization #2: if there is a concat layer that concatenates channels
2108             // from the inputs together (i.e. axis == 1), then we make the inputs of
2109             // the concat layer write directly into the concatenation output buffer
2110             // (and so we eliminate the concatenation layer, because the channels
2111             // are concatenated implicitly).
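            // Example: Concat(axis=1) of inputs [1,64,H,W] and [1,32,H,W] lets the
            // producers write directly into channel ranges [0,64) and [64,96) of
            // the preallocated [1,96,H,W] output.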
2112             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
2113             if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
2114                 ld.outputBlobs.size() == 1 )
2115             {
2116                 Mat& output = ld.outputBlobs[0];
2117                 UMat umat_output;
2118                 if (!ld.outputBlobsWrappers.empty() &&
2119                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
2120                 {
2121                     size_t i, ninputs = ld.inputBlobsId.size();
2122                     bool conv_layer = true;
2123                     for( i = 0; i < ninputs; i++ )
2124                     {
2125                         LayerPin pin = ld.inputBlobsId[i];
2126                         LayerData* inp_i_data = &layers[pin.lid];
2127                         while(inp_i_data->skip &&
2128                               inp_i_data->inputBlobsId.size() == 1 &&
2129                               inp_i_data->consumers.size() == 1)
2130                         {
2131                             pin = inp_i_data->inputBlobsId[0];
2132                             inp_i_data = &layers[pin.lid];
2133                         }
2134                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
2135                     }
2136                     if (!conv_layer)
2137                         continue;
2138                     std::vector<UMat> umat_outputBlobs;
2139                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2140                     umat_output = umat_outputBlobs[0];
2141                 }
2142
2143                 // TODO: in general, this optimization can always be done, but
2144                 // many layers currently check that the input/output blobs are
2145                 // continuous arrays. Unfortunately, this is not true when
2146                 // the concatenation optimization is applied with batch_size > 1.
2147                 // So, for now, we only apply this optimization in the most popular
2148                 // case batch_size == 1.
2149                 if( output.dims == 4 && output.size[0] == 1 )
2150                 {
2151                     size_t i, ninputs = ld.inputBlobsId.size();
2152                     std::vector<LayerPin> realinputs(ninputs);
2153                     for( i = 0; i < ninputs; i++ )
2154                     {
2155                         LayerPin pin = ld.inputBlobsId[i];
2156                         LayerData* inp_i_data = &layers[pin.lid];
2157                         while(inp_i_data->skip &&
2158                               inp_i_data->inputBlobsId.size() == 1 &&
2159                               inp_i_data->consumers.size() == 1)
2160                         {
2161                             pin = inp_i_data->inputBlobsId[0];
2162                             inp_i_data = &layers[pin.lid];
2163                         }
2164                         printf_(("\treal input for %s is %s\n",
2165                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
2166                                inp_i_data->getLayerInstance()->name.c_str()));
2167
2168                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
2169                             break;
2170                         realinputs[i] = pin;
2171                     }
2172
2173                     if( i >= ninputs )
2174                     {
2175                         // Allocate new memory to prevent collisions during memory
2176                         // reuse (see https://github.com/opencv/opencv/pull/10456).
2177                         output = output.clone();
2178                         if (preferableBackend == DNN_BACKEND_OPENCV &&
2179                             IS_DNN_OPENCL_TARGET(preferableTarget))
2180                         {
2181                             std::vector<UMat> umats(1);
2182                             umat_output = umat_output.clone();
2183                             umats[0] = umat_output;
2184                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2185                         }
2186                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2187                         int ofs = 0;
2188                         for( i = 0; i < ninputs; i++ )
2189                         {
2190                             LayerPin pin = realinputs[i];
2191                             LayerData* inp_i_data = &layers[pin.lid];
2192                             int channels_i = ld.inputBlobs[i]->size[1];
2193                             chrange[1] = Range(ofs, ofs + channels_i);
2194                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2195                                    pin.oid, ofs, ofs + channels_i));
2196                             ofs += channels_i;
2197                             Mat output_slice = output(chrange);
2198                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2199                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2200                             Mat* oldPtr = &curr_output;
2201                             curr_output = output_slice;
2202                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2203                             {
2204                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2205                                 umats[pin.oid] = umat_output(chrange);
2206                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2207                             }
2208                             // Layers that referred to the old input Mat will now refer
2209                             // to the new data through the same Mat object.
2210                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2211                         }
2212                         ld.skip = true;
2213                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2214                     }
2215                 }
2216             }
2217         }
2218     }
2219
2220     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2221     {
2222         CV_TRACE_FUNCTION();
2223
2224         MapIdToLayerData::iterator it;
2225         for (it = layers.begin(); it != layers.end(); it++)
2226             it->second.flag = 0;
2227
2228         CV_Assert(!layers[0].outputBlobs.empty());
2229         ShapesVec inputShapes;
2230         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2231         {
2232             Mat& inp = layers[0].outputBlobs[i];
2233             CV_Assert(inp.total());
2234             if (preferableBackend == DNN_BACKEND_OPENCV &&
2235                 preferableTarget == DNN_TARGET_OPENCL_FP16)
2236             {
2237                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
2238             }
2239             inputShapes.push_back(shape(inp));
2240         }
2241         LayersShapesMap layersShapes;
2242         getLayersShapes(inputShapes, layersShapes);
2243
2244         blobManager.reset();
2245         backendWrappers.clear();
2246         // Fake references to input blobs.
2247         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2248             blobManager.addReference(LayerPin(0, i));
2249         for (it = layers.begin(); it != layers.end(); ++it)
2250         {
2251             const LayerData& ld = it->second;
2252             blobManager.addReferences(ld.inputBlobsId);
2253         }
2254
2255         for (int i = 0; i < blobsToKeep_.size(); i++)
2256         {
2257             blobManager.addReference(blobsToKeep_[i]);
2258         }
2259
2260         for (it = layers.begin(); it != layers.end(); it++)
2261         {
2262             int lid = it->first;
2263             allocateLayer(lid, layersShapes);
2264         }
2265
2266         layersTimings.resize(lastLayerId + 1, 0);
2267         fuseLayers(blobsToKeep_);
2268     }
2269
2270     void forwardLayer(LayerData &ld)
2271     {
2272         CV_TRACE_FUNCTION();
2273
2274         Ptr<Layer> layer = ld.layerInstance;
2275
2276         TickMeter tm;
2277         tm.start();
2278
2279         if( !ld.skip )
2280         {
2281             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2282             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2283             {
2284                 if (isAsync)
2285                     CV_Error(Error::StsNotImplemented, "Default implementation fallback is not supported in asynchronous mode");
2286
2287                 if (!layer->supportBackend(DNN_BACKEND_OPENCV))
2288                     CV_Error(Error::StsNotImplemented, format("Layer \"%s\" of type \"%s\" unsupported on OpenCV backend",
2289                                                        ld.name.c_str(), ld.type.c_str()));
2290
2291                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2292                 {
2293                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2294                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2295                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2296                     layer->forward(umat_inputBlobs,
2297                                    umat_outputBlobs,
2298                                    umat_internalBlobs);
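                    // Diagnostics note: DNN_CHECK_NAN_INF and the _DUMP/_RAISE_ERROR
                    // flags below are configuration parameters read once at startup,
                    // e.g. (illustrative): OPENCV_DNN_CHECK_NAN_INF=1 ./my_app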
2299                     if (DNN_CHECK_NAN_INF)
2300                     {
2301                         bool fail = false;
2302                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2303                         {
2304                             UMat& u = umat_outputBlobs[i];
2305                             Mat m;
2306                             if (u.depth() == CV_16S) // FP16
2307                                 convertFp16(u, m);
2308                             else
2309                                 m = u.getMat(ACCESS_READ);
2310                             if (!checkRange(m))
2311                             {
2312                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2313                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2314                                 fail = true;
2315                             }
2316                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2317                             {
2318                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2319                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2320                                 fail = true;
2321                             }
2322                         }
2323                         if (fail)
2324                         {
2325                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2326                             {
2327                                 UMat& u = umat_inputBlobs[i];
2328                                 Mat m;
2329                                 if (u.depth() == CV_16S) // FP16
2330                                     convertFp16(u, m);
2331                                 else
2332                                     m = u.getMat(ACCESS_READ);
2333                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2334                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2335                             }
2336                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2337                             {
2338                                 UMat& u = umat_outputBlobs[i];
2339                                 Mat m;
2340                                 if (u.depth() == CV_16S) // FP16
2341                                     convertFp16(u, m);
2342                                 else
2343                                     m = u.getMat(ACCESS_READ);
2344                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2345                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2346                             }
2347                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2348                             {
2349                                 UMat& u = umat_internalBlobs[i];
2350                                 Mat m;
2351                                 if (u.depth() == CV_16S) // FP16
2352                                     convertFp16(u, m);
2353                                 else
2354                                     m = u.getMat(ACCESS_READ);
2355                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2356                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2357                             }
2358                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2359                                 CV_Assert(!fail);
2360                         }
2361                     }
2362                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2363                 }
2364                 else
2365                 {
2366                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2367                     {
2368                         if (!ld.inputBlobsWrappers[i].empty())
2369                             ld.inputBlobsWrappers[i]->copyToHost();
2370                     }
2371
2372                     std::vector<Mat> inps(ld.inputBlobs.size());
2373                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
2374                     {
2375                         inps[i] = *ld.inputBlobs[i];
2376                     }
2377                     layer->forward(inps, ld.outputBlobs, ld.internals);
2378
2379                     if (DNN_CHECK_NAN_INF)
2380                     {
2381                         bool fail = false;
2382                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2383                         {
2384                             const Mat& m = ld.outputBlobs[i];
2385                             if (!checkRange(m))
2386                             {
2387                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2388                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2389                                 fail = true;
2390                             }
2391                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2392                             {
2393                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2394                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2395                                 fail = true;
2396                             }
2397                         }
2398                         if (fail)
2399                         {
2400                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2401                             {
2402                                 const Mat* pM = ld.inputBlobs[i];
2403                                 if (!pM)
2404                                 {
2405                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
2406                                     continue;
2407                                 }
2408                                 const Mat& m = *pM;
2409                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2410                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2411                             }
2412                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2413                             {
2414                                 const Mat& m = ld.outputBlobs[i];
2415                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2416                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2417                             }
2418                             for (size_t i = 0; i < ld.internals.size(); ++i)
2419                             {
2420                                 const Mat& m = ld.internals[i];
2421                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2422                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2423                             }
2424                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2425                                 CV_Assert(!fail);
2426                         }
2427                     }
2428
2429                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2430                     {
2431                         if (!ld.outputBlobsWrappers[i].empty())
2432                             ld.outputBlobsWrappers[i]->setHostDirty();
2433                     }
2434                 }
2435             }
2436             else
2437             {
2438                 Ptr<BackendNode> node = it->second;
2439                 CV_Assert(!node.empty());
2440                 if (preferableBackend == DNN_BACKEND_HALIDE)
2441                 {
2442                     forwardHalide(ld.outputBlobsWrappers, node);
2443                 }
2444                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2445                 {
2446                     forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
2447                 }
2448                 else if (preferableBackend == DNN_BACKEND_VKCOM)
2449                 {
2450                     try
2451                     {
2452                         forwardVkCom(ld.outputBlobsWrappers, node);
2453                     }
2454                     catch (const cv::Exception& e)
2455                     {
2456                         CV_LOG_ERROR(NULL, "forwardVkCom failed, falling back to CPU implementation. " << e.what());
2457                         it->second = Ptr<BackendNode>();
2458                         forwardLayer(ld);
2459                     }
2460                 }
2461                 else
2462                 {
2463                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2464                 }
2465             }
2466         }
2467         else
2468             tm.reset();
2469
2470         tm.stop();
2471         layersTimings[ld.id] = tm.getTimeTicks();
2472
2473         ld.flag = 1;
2474     }
2475
2476     void forwardToLayer(LayerData &ld, bool clearFlags = true)
2477     {
2478         CV_TRACE_FUNCTION();
2479
2480         if (clearFlags)
2481         {
2482             MapIdToLayerData::iterator it;
2483             for (it = layers.begin(); it != layers.end(); it++)
2484                 it->second.flag = 0;
2485         }
2486
2487         // already forwarded
2488         if (ld.flag)
2489             return;
2490
2491         //forward parents
2492         MapIdToLayerData::iterator it;
2493         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2494         {
2495             LayerData &ld = it->second;
2496             if (ld.flag)
2497                 continue;
2498             forwardLayer(ld);
2499         }
2500
2501         //forward itself
2502         forwardLayer(ld);
2503     }
2504
2505     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2506     {
2507         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2508
2509         if (inOutShapes[0].in[0].empty() && !layers[0].outputBlobs.empty())
2510         {
2511             ShapesVec shapes;
2512             for (int i = 0; i < layers[0].outputBlobs.size(); i++)
2513             {
2514                 Mat& inp = layers[0].outputBlobs[i];
2515                 CV_Assert(inp.total());
2516                 shapes.push_back(shape(inp));
2517             }
2518             inOutShapes[0].in = shapes;
2519         }
2520
2521         if (inOutShapes[id].in.empty())
2522         {
2523             for(int i = 0; i < inputLayerIds.size(); i++)
2524             {
2525                 int layerId = inputLayerIds[i].lid;
2526                 LayersShapesMap::iterator it =
2527                         inOutShapes.find(layerId);
2528                 if(it == inOutShapes.end() ||
2529                         it->second.out.empty())
2530                 {
2531                     getLayerShapesRecursively(layerId, inOutShapes);
2532                 }
2533                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2534                 inOutShapes[id].in.push_back(shape);
2535             }
2536         }
2537         const ShapesVec& is = inOutShapes[id].in;
2538         ShapesVec& os = inOutShapes[id].out;
2539         ShapesVec& ints = inOutShapes[id].internal;
2540         int requiredOutputs = layers[id].requiredOutputs.size();
2541         inOutShapes[id].supportInPlace =
2542                 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2543     }
2544
2545     void getLayersShapes(const ShapesVec& netInputShapes,
2546                          LayersShapesMap& inOutShapes)
2547     {
2548         inOutShapes.clear();
2549
2550         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2551         for (MapIdToLayerData::iterator it = layers.begin();
2552              it != layers.end(); it++)
2553         {
2554             getLayerShapesRecursively(it->first, inOutShapes);
2555         }
2556     }
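    // Usage sketch (illustrative; the input shape is a placeholder):
#if 0
    void exampleShapeInference()
    {
        ShapesVec netInputShapes(1, shape(1, 3, 224, 224));
        LayersShapesMap shapesMap;
        getLayersShapes(netInputShapes, shapesMap);  // fills in/out/internal shapes per layer id
    }
#endif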
2557
2558     void getLayerShapes(const ShapesVec& netInputShapes,
2559                         const int layerId,
2560                         LayerShapes& shapes)
2561     {
2562         LayersShapesMap inOutShapes;
2563         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2564         getLayerShapesRecursively(layerId, inOutShapes);
2565         shapes = inOutShapes[layerId];
2566     }
2567
2568     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2569     {
2570         return *std::max_element(pins.begin(), pins.end());
2571     }
2572
2573     Mat getBlob(const LayerPin& pin)
2574     {
2575         CV_TRACE_FUNCTION();
2576
2577         if (!pin.valid())
2578             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2579
2580         LayerData &ld = layers[pin.lid];
2581         if ((size_t)pin.oid >= ld.outputBlobs.size())
2582         {
2583             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %zu outputs, "
2584                                            "the #%d was requested", ld.name.c_str(),
2585                                            ld.outputBlobs.size(), pin.oid));
2586         }
2587         if (preferableTarget != DNN_TARGET_CPU)
2588         {
2589             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2590             // Transfer data to CPU if required.
2591             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2592         }
2593
2594         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2595         {
2596             convertFp16(ld.outputBlobs[pin.oid], output_blob);
2597             return output_blob;
2598         }
2599         else
2600             return ld.outputBlobs[pin.oid];
2601     }
2602
2603     Mat getBlob(String outputName)
2604     {
2605         return getBlob(getPinByAlias(outputName));
2606     }
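    // Note (illustrative): Net::forward("name") resolves the requested output
    // through getPinByAlias() and returns this blob, converting FP16 data back
    // to FP32 on the way out.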
2607
2608 #ifdef CV_CXX11
2609     AsyncArray getBlobAsync(const LayerPin& pin)
2610     {
2611         CV_TRACE_FUNCTION();
2612 #ifdef HAVE_INF_ENGINE
2613         if (!pin.valid())
2614             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2615
2616         LayerData &ld = layers[pin.lid];
2617         if ((size_t)pin.oid >= ld.outputBlobs.size())
2618         {
2619             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2620                                            "the #%d was requested", ld.name.c_str(),
2621                                            (int)ld.outputBlobs.size(), (int)pin.oid));
2622         }
2623         if (preferableTarget != DNN_TARGET_CPU)
2624         {
2625             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2626             // Transfer data to CPU if required.
2627             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2628         }
2629         CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
2630
2631         Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
2632         return std::move(wrapper->futureMat);
2633 #else
2634         CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
2635 #endif
2636     }
2637
2638     AsyncArray getBlobAsync(String outputName)
2639     {
2640         return getBlobAsync(getPinByAlias(outputName));
2641     }
2642 #endif  // CV_CXX11
2643 };
2644
2645 Net::Net() : impl(new Net::Impl)
2646 {
2647 }
2648
2649 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2650 {
2651 #ifndef HAVE_INF_ENGINE
2652     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2653 #else
2654     InferenceEngine::CNNNetReader reader;
2655     reader.ReadNetwork(xml);
2656     reader.ReadWeights(bin);
2657
2658     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2659
2660     std::vector<String> inputsNames;
2661     std::vector<MatShape> inp_shapes;
2662     for (auto& it : ieNet.getInputsInfo())
2663     {
2664         inputsNames.push_back(it.first);
2665         std::vector<size_t> dims = it.second->getTensorDesc().getDims();
2666         inp_shapes.push_back(std::vector<int>(dims.begin(), dims.end()));
2667     }
2668
2669     Net cvNet;
2670     cvNet.setInputsNames(inputsNames);
2671
2672     // set empty input to determine input shapes
2673     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
2674     {
2675         cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
2676     }
2677
2678     Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
2679     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2680     for (auto& it : ieNet.getOutputsInfo())
2681     {
2682         Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
2683         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2684         CV_Assert(ieLayer);
2685
2686         LayerParams lp;
2687         int lid = cvNet.addLayer(it.first, "", lp);
2688
2689         LayerData& ld = cvNet.impl->layers[lid];
2690         cvLayer->name = it.first;
2691         cvLayer->type = ieLayer->type;
2692         ld.layerInstance = cvLayer;
2693         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2694
2695         for (int i = 0; i < inputsNames.size(); ++i)
2696             cvNet.connect(0, i, lid, i);
2697     }
2698     cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2699
2700     cvNet.impl->skipInfEngineInit = true;
2701     return cvNet;
2702 #endif  // HAVE_INF_ENGINE
2703 }
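
// Usage sketch (illustrative only; the file names are hypothetical):
//
//     Net net = readNetFromModelOptimizer("model.xml", "model.bin");
//     net.setInput(blobFromImage(img));  // input names and shapes come from the IR
//     Mat prob = net.forward();
//
// The returned network is pinned to DNN_BACKEND_INFERENCE_ENGINE above, so it
// cannot be switched to another backend afterwards.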
2704
2705 Net::~Net()
2706 {
2707 }
2708
2709 int Net::addLayer(const String &name, const String &type, LayerParams &params)
2710 {
2711     CV_TRACE_FUNCTION();
2712
2713     if (impl->getLayerId(name) >= 0)
2714     {
2715         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" is already in the net");
2716         return -1;
2717     }
2718
2719     int id = ++impl->lastLayerId;
2720     impl->layerNameToId.insert(std::make_pair(name, id));
2721     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2722
2723     return id;
2724 }
2725
2726 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
2727 {
2728     CV_TRACE_FUNCTION();
2729
2730     int prvLid = impl->lastLayerId;
2731     int newLid = this->addLayer(name, type, params);
2732     this->connect(prvLid, 0, newLid, 0);
2733     return newLid;
2734 }
2735
2736 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2737 {
2738     CV_TRACE_FUNCTION();
2739
2740     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2741 }
2742
2743 void Net::connect(String _outPin, String _inPin)
2744 {
2745     CV_TRACE_FUNCTION();
2746
2747     LayerPin outPin = impl->getPinByAlias(_outPin);
2748     LayerPin inpPin = impl->getPinByAlias(_inPin);
2749
2750     CV_Assert(outPin.valid() && inpPin.valid());
2751
2752     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2753 }
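
// Usage sketch (illustrative; the layer names are hypothetical). A pin
// descriptor names a layer and, optionally, one of its pins; a bare layer
// name refers to pin #0:
//
//     net.connect("conv1", "relu1");  // conv1 output #0 -> relu1 input #0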
2754
2755 Mat Net::forward(const String& outputName)
2756 {
2757     CV_TRACE_FUNCTION();
2758
2759     String layerName = outputName;
2760
2761     if (layerName.empty())
2762         layerName = getLayerNames().back();
2763
2764     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2765     impl->setUpNet(pins);
2766     impl->forwardToLayer(impl->getLayerData(layerName));
2767
2768     return impl->getBlob(layerName);
2769 }
2770
2771 AsyncArray Net::forwardAsync(const String& outputName)
2772 {
2773     CV_TRACE_FUNCTION();
2774 #ifdef CV_CXX11
2775     String layerName = outputName;
2776
2777     if (layerName.empty())
2778         layerName = getLayerNames().back();
2779
2780     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2781     impl->setUpNet(pins);
2782
2783     if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
2784         CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only for the DNN_BACKEND_INFERENCE_ENGINE backend");
2785
2786     impl->isAsync = true;
2787     impl->forwardToLayer(impl->getLayerData(layerName));
2788     impl->isAsync = false;
2789
2790     return impl->getBlobAsync(layerName);
2791 #else
2792     CV_Error(Error::StsNotImplemented, "Asynchronous forward requires a build with C++11 support");
2793 #endif  // CV_CXX11
2794 }
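
// Usage sketch (illustrative; assumes a build with Inference Engine):
//
//     net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
//     net.setInput(blob);
//     AsyncArray futureResult = net.forwardAsync();
//     Mat result;
//     futureResult.get(result);  // blocks until the inference request completes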
2795
2796 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2797 {
2798     CV_TRACE_FUNCTION();
2799
2800     String layerName = outputName;
2801
2802     if (layerName.empty())
2803         layerName = getLayerNames().back();
2804
2805     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2806     impl->setUpNet(pins);
2807     impl->forwardToLayer(impl->getLayerData(layerName));
2808
2809     LayerPin pin = impl->getPinByAlias(layerName);
2810     LayerData &ld = impl->layers[pin.lid];
2811
2812     if (outputBlobs.isUMat())
2813     {
2814         impl->getBlob(layerName).copyTo(outputBlobs);
2815     }
2816     else if (outputBlobs.isMat())
2817     {
2818         outputBlobs.assign(impl->getBlob(layerName));
2819     }
2820     else if (outputBlobs.isMatVector())
2821     {
2822         if (impl->preferableTarget != DNN_TARGET_CPU)
2823         {
2824             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2825             {
2826                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2827                 ld.outputBlobsWrappers[i]->copyToHost();
2828             }
2829         }
2830         if (ld.outputBlobs[0].depth() == CV_32F)
2831         {
2832             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2833             outputvec = ld.outputBlobs;
2834         } else {
2835             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2836             outputvec.resize(ld.outputBlobs.size());
2837             for (int i = 0; i < outputvec.size(); i++)
2838                 convertFp16(ld.outputBlobs[i], outputvec[i]);
2839         }
2840     }
2841     else if (outputBlobs.isUMatVector())
2842     {
2843         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
2844
2845         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
2846             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
2847         {
2848             if (impl->preferableTarget == DNN_TARGET_OPENCL)
2849                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2850             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
2851             {
2852                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2853                 outputvec.resize(out_vec.size());
2854                 for (int i = 0; i < out_vec.size(); i++)
2855                     convertFp16(out_vec[i], outputvec[i]);
2856             }
2857         }
2858         else
2859         {
2860             outputvec.resize(ld.outputBlobs.size());
2861             for (int i = 0; i < outputvec.size(); ++i)
2862                 ld.outputBlobs[i].copyTo(outputvec[i]);
2863         }
2864     }
2865 }
2866
2867 void Net::forward(OutputArrayOfArrays outputBlobs,
2868                   const std::vector<String>& outBlobNames)
2869 {
2870     CV_TRACE_FUNCTION();
2871
2872     std::vector<LayerPin> pins;
2873     for (int i = 0; i < outBlobNames.size(); i++)
2874     {
2875         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2876     }
2877
2878     impl->setUpNet(pins);
2879
2880     LayerPin out = impl->getLatestLayerPin(pins);
2881
2882     impl->forwardToLayer(impl->getLayerData(out.lid));
2883
2884     std::vector<Mat> matvec;
2885     for (int i = 0; i < pins.size(); i++)
2886     {
2887         matvec.push_back(impl->getBlob(pins[i]));
2888     }
2889
2890     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2891     outputvec = matvec;
2892 }
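
// Usage sketch (illustrative), fetching several outputs in one pass, e.g. for
// detection models with multiple output heads:
//
//     std::vector<Mat> outs;
//     net.forward(outs, net.getUnconnectedOutLayersNames());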
2893
2894 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
2895                      const std::vector<String>& outBlobNames)
2896 {
2897     CV_TRACE_FUNCTION();
2898
2899     std::vector<LayerPin> pins;
2900     for (int i = 0; i < outBlobNames.size(); i++)
2901     {
2902         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2903     }
2904
2905     impl->setUpNet(pins);
2906
2907     LayerPin out = impl->getLatestLayerPin(pins);
2908
2909     impl->forwardToLayer(impl->getLayerData(out.lid));
2910
2911     outputBlobs.resize(outBlobNames.size());
2912     for (int i = 0; i < outBlobNames.size(); i++)
2913     {
2914         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2915         outputBlobs[i].resize(lp.size());
2916         for (int j = 0; j < lp.size(); j++)
2917         {
2918             outputBlobs[i][j] = impl->getBlob(lp[j]);
2919         }
2920     }
2921 }
2922
2923 void Net::setPreferableBackend(int backendId)
2924 {
2925     CV_TRACE_FUNCTION();
2926     CV_TRACE_ARG(backendId);
2927
2928     if( impl->preferableBackend != backendId )
2929     {
2930         impl->preferableBackend = backendId;
2931         impl->netWasAllocated = false;
2932         impl->clear();
2933     }
2934 }
2935
2936 void Net::setPreferableTarget(int targetId)
2937 {
2938     CV_TRACE_FUNCTION();
2939     CV_TRACE_ARG(targetId);
2940
2941     if( impl->preferableTarget != targetId )
2942     {
2943         impl->preferableTarget = targetId;
2944         if (IS_DNN_OPENCL_TARGET(targetId))
2945         {
2946 #ifndef HAVE_OPENCL
2947 #ifdef HAVE_INF_ENGINE
2948             if (impl->preferableBackend == DNN_BACKEND_OPENCV)
2949 #else
2950             if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
2951                 impl->preferableBackend == DNN_BACKEND_OPENCV)
2952 #endif  // HAVE_INF_ENGINE
2953                 impl->preferableTarget = DNN_TARGET_CPU;
2954 #else
2955             bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
2956             if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
2957                 impl->preferableTarget = DNN_TARGET_OPENCL;
2958 #endif
2959         }
2960         impl->netWasAllocated = false;
2961         impl->clear();
2962     }
2963 }
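
// Typical configuration sketch (illustrative):
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL_FP16);
//
// As implemented above, an OpenCL target for the OpenCV backend silently falls
// back to DNN_TARGET_CPU in builds without OpenCL, and DNN_TARGET_OPENCL_FP16
// falls back to DNN_TARGET_OPENCL when the device lacks cl_khr_fp16.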
2964
2965 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
2966 {
2967     CV_TRACE_FUNCTION();
2968
2969     impl->netInputLayer->setNames(inputBlobNames);
2970 }
2971
2972 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
2973 {
2974     CV_TRACE_FUNCTION();
2975     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
2976
2977     LayerPin pin;
2978     pin.lid = 0;
2979     pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);
2980
2981     if (!pin.valid())
2982         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
2983
2984     LayerData &ld = impl->layers[pin.lid];
2985     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
2986     ld.outputBlobs.resize(numInputs);
2987     ld.outputBlobsWrappers.resize(numInputs);
2988     impl->netInputLayer->inputsData.resize(numInputs);
2989     impl->netInputLayer->scaleFactors.resize(numInputs);
2990     impl->netInputLayer->means.resize(numInputs);
2991
2992     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
2993     Mat blob_ = blob.getMat();
2994     bool oldShape = prevShape == shape(blob_);
2995     if (oldShape)
2996     {
2997         blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
2998     }
2999     else
3000     {
3001         ld.outputBlobs[pin.oid] = blob_.clone();
3002         impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
3003     }
3004
3005     if (!ld.outputBlobsWrappers[pin.oid].empty())
3006     {
3007         ld.outputBlobsWrappers[pin.oid]->setHostDirty();
3008     }
3009     impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
3010     impl->netInputLayer->means[pin.oid] = mean;
3011     impl->netWasAllocated = impl->netWasAllocated && oldShape;
3012 }
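
// Usage sketch (illustrative; the input name, scalefactor and mean values are
// hypothetical and model-dependent):
//
//     Mat blob = blobFromImage(img, 1.0, Size(224, 224), Scalar(), /*swapRB=*/true);
//     net.setInput(blob, "data", 1.0/255, Scalar(104, 117, 123));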
3013
3014 Mat Net::getParam(LayerId layer, int numParam)
3015 {
3016     LayerData &ld = impl->getLayerData(layer);
3017     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3018     CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
3019     return layerBlobs[numParam];
3020 }
3021
3022 void Net::setParam(LayerId layer, int numParam, const Mat &blob)
3023 {
3024     LayerData &ld = impl->getLayerData(layer);
3025
3026     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3027     CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
3028     // we don't make strong checks; use this function carefully
3029     layerBlobs[numParam] = blob;
3030 }
3031
3032 int Net::getLayerId(const String &layer)
3033 {
3034     return impl->getLayerId(layer);
3035 }
3036
3037 String parseLayerParams(const String& name, const LayerParams& lp) {
3038     DictValue param = lp.get(name);
3039     std::ostringstream out;
3040     out << name << " ";
3041     switch (param.size()) {
3042         case 1: out << ": "; break;
3043         case 2: out << "(HxW): "; break;
3044         case 3: out << "(DxHxW): "; break;
3045         default: CV_Error(Error::StsNotImplemented, format("Unsupported %s size = %d", name.c_str(), param.size()));
3046     }
3047     for (size_t i = 0; i < param.size() - 1; i++) {
3048         out << param.get<int>(i) << " x ";
3049     }
3050     out << param.get<int>(param.size() - 1) << "\\l";
3051     return out.str();
3052 }
3053
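// dump() emits the network as a Graphviz digraph. Layers that were fused into
// a single backend node (or skipped) are folded into one "cluster_N" record
// node; the bookkeeping below maps each layer id either to -1 (a standalone
// node) or to the index of the cluster it belongs to.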
3054 String Net::dump()
3055 {
3056     CV_Assert(!empty());
3057
3058     if (impl->netInputLayer->inputsData.empty())
3059         CV_Error(Error::StsError, "Network input blobs are not set; call setInput() before dump()");
3060
3061     if (!impl->netWasAllocated)
3062         impl->setUpNet();
3063
3064     std::ostringstream out;
3065     std::map<int, LayerData>& map = impl->layers;
3066     int prefBackend = impl->preferableBackend;
3067     std::vector<std::vector<int> > skippedLayers;
3068     std::vector<int> skipId;
3069     std::vector<int> allLayers(map.size(), -1);
3070     int idPrev = -1;
3071     Ptr<BackendNode> prevNode;
3072     for (std::map<int, LayerData>::reverse_iterator rit = map.rbegin(); rit != map.rend(); ++rit)
3073     {
3074         std::map<int, Ptr<BackendNode> >::iterator itBackend = rit->second.backendNodes.find(prefBackend);
3075         if (prefBackend == DNN_BACKEND_OPENCV || itBackend == rit->second.backendNodes.end() ||
3076             itBackend->second.empty())
3077         {
3078                 if (rit->second.skip)
3079                     skipId.push_back(rit->first);
3080                 else if (!skipId.empty())
3081                 {
3082                     if (prefBackend == DNN_BACKEND_OPENCV || prevNode.empty())
3083                         skipId.push_back(rit->first);
3084                     else if (idPrev != -1)
3085                         skipId.push_back(idPrev);
3086
3087                     std::sort(skipId.begin(), skipId.end());
3088                     for (int i = 0; i < skipId.size(); i++) {
3089                         allLayers[skipId[i]] = skippedLayers.size();
3090                     }
3091                     skippedLayers.push_back(skipId);
3092                     skipId.clear();
3093                 }
3094         }
3095         else
3096         {
3097             if (itBackend->second == prevNode)
3098                 skipId.push_back(idPrev);
3099             else if (!skipId.empty())
3100             {
3101                 skipId.push_back(idPrev);
3102                 std::sort(skipId.begin(), skipId.end());
3103                 for (int i = 0; i < skipId.size(); i++) {
3104                     allLayers[skipId[i]] = skippedLayers.size();
3105                 }
3106                 skippedLayers.push_back(skipId);
3107                 skipId.clear();
3108             }
3109             idPrev = rit->first;
3110             prevNode = itBackend->second;
3111         }
3112     }
3113     String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462"};
3114     String backend;
3115     switch (prefBackend) {
3116         case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
3117         case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
3118         case DNN_BACKEND_INFERENCE_ENGINE: backend = "DLIE/"; break;
3119         case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
3120     }
3121     out << "digraph G {" << '\n';
3122     // Add nodes
3123     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3124     {
3125         String name = it->second.params.name;
3126         if (allLayers[it->first] == -1 && !name.empty()) {
3127             out << "    " << "\"" << name << "\"" << " [label=\"";
3128             skipId.clear();
3129             skipId.push_back(it->first);
3130         }
3131         else if (name.empty() || it->first != skippedLayers[allLayers[it->first]][0])
3132             continue;
3133         else { // first node in cluster : it->first == skippedLayers[allLayers[it->first]][0]
3134             int cluster = allLayers[it->first];
3135             out << "    " << "\"" << "cluster_" << cluster << "\"" << " [label=\"{";
3136             skipId = skippedLayers[allLayers[it->first]]; // vertices in current cluster
3137         }
3138         for (int i = 0; i < skipId.size(); i++)
3139         {
3140             LayerParams& lp = map[skipId[i]].params;
3141             if (!lp.name.empty()) {
3142                 if (i > 0) {
3143                     out << " | ";
3144                 }
3145                 out << lp.name << "\\n" << lp.type << "\\n";
3146                 if (lp.has("kernel_size")) {
3147                     String kernel = parseLayerParams("kernel_size", lp);
3148                     out << kernel;
3149                 } else if (lp.has("kernel_h") && lp.has("kernel_w")) {
3150                     DictValue h = lp.get("kernel_h");
3151                     DictValue w = lp.get("kernel_w");
3152                     out << "kernel (HxW): " << h << " x " << w << "\\l";
3153                 }
3154                 if (lp.has("stride")) {
3155                     String stride = parseLayerParams("stride", lp);
3156                     out << stride;
3157                 } else if (lp.has("stride_h") && lp.has("stride_w")) {
3158                     DictValue h = lp.get("stride_h");
3159                     DictValue w = lp.get("stride_w");
3160                     out << "stride (HxW): " << h << " x " << w << "\\l";
3161                 }
3162                 if (lp.has("dilation")) {
3163                     String dilation = parseLayerParams("dilation", lp);
3164                     out << dilation;
3165                 } else if (lp.has("dilation_h") && lp.has("dilation_w")) {
3166                     DictValue h = lp.get("dilation_h");
3167                     DictValue w = lp.get("dilation_w");
3168                     out << "dilation (HxW): " << h << " x " << w << "\\l";
3169                 }
3170                 if (lp.has("pad")) {
3171                     DictValue pad = lp.get("pad");
3172                     out << "pad ";
3173                     switch (pad.size()) {
3174                         case 1: out << ": " << pad << "\\l"; break;
3175                         case 2: out << "(HxW): (" << pad.get<int>(0) << " x " << pad.get<int>(1) << ")" << "\\l"; break;
3176                         case 4: out << "(HxW): (" << pad.get<int>(0) << ", " << pad.get<int>(2) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(3) << ")" << "\\l"; break;
3177                         case 6: out << "(DxHxW): (" << pad.get<int>(0) << ", " << pad.get<int>(3) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(4)
3178                                 << ") x (" << pad.get<int>(2) << ", " << pad.get<int>(5) << ")" << "\\l"; break;
3179                         default: CV_Error(Error::StsNotImplemented,  format("Unsupported pad size = %d", pad.size()));
3180                     }
3181                  } else if (lp.has("pad_l") && lp.has("pad_t") && lp.has("pad_r") && lp.has("pad_b")) {
3182                      DictValue l = lp.get("pad_l");
3183                      DictValue t = lp.get("pad_t");
3184                      DictValue r = lp.get("pad_r");
3185                      DictValue b = lp.get("pad_b");
3186                      out << "pad (HxW): (" << t << ", " << b << ") x (" << l << ", " << r << ")" << "\\l";
3187                  }
3188                  else if (lp.has("pooled_w") || lp.has("pooled_h")) {
3189                      DictValue h = lp.get("pooled_h");
3190                      DictValue w = lp.get("pooled_w");
3191                      out << "pad (HxW): " << h << " x " << w << "\\l";
3192                  }
3193                  if (lp.has("pool")) {
3194                      out << "pool: " << lp.get("pool") << "\\l";
3195                  }
3196                  if (lp.has("global_pooling")) {
3197                      out << "global_pooling: " << lp.get("global_pooling") << "\\l";
3198                  }
3199                  if (lp.has("group")) {
3200                      out << "group: " << lp.get("group") << "\\l";
3201                  }
3202              }
3203          }
3204          if (!it->second.outputBlobs.empty())
3205              out << "output: " << it->second.outputBlobs[0].size << "\\l";
3206
3207          Ptr<BackendNode> layerBackend = it->second.backendNodes[prefBackend];
3208          out << (!layerBackend.empty() ? backend : "OCV/");
3209          int colorId = 0;
3210          switch (it->second.layerInstance->preferableTarget) {
3211              case DNN_TARGET_CPU: out << "CPU\\n"; colorId = layerBackend.empty() ? 0 : 5; break;
3212              case DNN_TARGET_OPENCL: out << "OCL\\n"; colorId = 1; break;
3213              case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16\\n"; colorId = 2; break;
3214              case DNN_TARGET_MYRIAD: out << "MYRIAD\\n"; colorId = 3; break;
3215              case DNN_TARGET_FPGA: out << "FPGA\\n"; colorId = 4; break;
3216          }
3217          out << ((skipId.size() == 1)? "\" " : " }\" ");
3218          out << "fillcolor=\"" << colors[colorId] << "\" ";
3219          out << "style=filled ";
3220          out << "shape=" << ((skipId.size() == 1)? "box" : "record") << "]" << '\n';
3221     }
3222     out << '\n';
3223     // Add edges
3224     int inputsSize = impl->netInputLayer->outNames.size();
3225     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3226     {
3227         if (allLayers[it->first] == -1)  // node
3228         {
3229             for (int i = 0; i < it->second.consumers.size(); i++)
3230             {
3231                 int outId = it->second.consumers[i].lid;
3232                 if (it == map.begin() && inputsSize > 1)
3233                     out << "    " << "\"" << it->second.name << "_" << i << "\"" << " -> ";
3234                 else
3235                     out << "    " << "\"" << it->second.name << "\"" << " -> ";
3236                 if (allLayers[outId] == -1)  // node
3237                     out << "\"" << map[outId].name << "\"" << '\n';
3238                 else  // cluster
3239                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3240             }
3241         }
3242         else if (it->first == skippedLayers[allLayers[it->first]].back())  // edges from last layer in cluster
3243         {
3244             for (int i = 0; i < it->second.consumers.size(); i++)
3245             {
3246                 int outId = it->second.consumers[i].lid;
3247                 if (allLayers[outId] == -1) { // node
3248                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3249                     out << "\"" << map[outId].name << "\"" << '\n';
3250                 }
3251                 else if (allLayers[outId] != allLayers[it->first]) { // another cluster
3252                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3253                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3254                 }
3255             }
3256         }
3257     }
3258     out << "}";
3259     return out.str();
3260 }
3261
3262 void Net::dumpToFile(const String& path) {
3263     std::ofstream file(path.c_str());
3264     file << dump();
3265     file.close();
3266 }
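
// Usage sketch (illustrative): the produced text is Graphviz dot, so a file
// written by dumpToFile() can be rendered with e.g. `dot -Tpng net.dot -o net.png`:
//
//     net.dumpToFile("net.dot");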
3267
3268 Ptr<Layer> Net::getLayer(LayerId layerId)
3269 {
3270     LayerData &ld = impl->getLayerData(layerId);
3271     return ld.getLayerInstance();
3272 }
3273
3274 std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
3275 {
3276     LayerData &ld = impl->getLayerData(layerId);
3277     if (!ld.layerInstance)
3278         CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));
3279
3280     std::vector<Ptr<Layer> > inputLayers;
3281     inputLayers.reserve(ld.inputLayersId.size());
3282     std::set<int>::iterator it;
3283     for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
3284         inputLayers.push_back(getLayer(*it));
3285     }
3286     return inputLayers;
3287 }
3288
3289 std::vector<String> Net::getLayerNames() const
3290 {
3291     std::vector<String> res;
3292     res.reserve(impl->layers.size());
3293
3294     Impl::MapIdToLayerData::iterator it;
3295     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3296     {
3297         if (it->second.id) //skip Data layer
3298             res.push_back(it->second.name);
3299     }
3300
3301     return res;
3302 }
3303
3304 bool Net::empty() const
3305 {
3306     return impl->layers.size() <= 1; //first layer is default Data layer
3307 }
3308
3309 std::vector<int> Net::getUnconnectedOutLayers() const
3310 {
3311     std::vector<int> layersIds;
3312
3313     Impl::MapIdToLayerData::iterator it;
3314     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3315     {
3316         int lid = it->first;
3317         LayerData &ld = it->second;
3318
3319         if (ld.requiredOutputs.size() == 0)
3320             layersIds.push_back(lid);
3321     }
3322
3323     return layersIds;
3324 }
3325
3326 std::vector<String> Net::getUnconnectedOutLayersNames() const
3327 {
3328     std::vector<int> ids = getUnconnectedOutLayers();
3329     const size_t n = ids.size();
3330     std::vector<String> names(n);
3331     for (size_t i = 0; i < n; ++i)
3332     {
3333         names[i] = impl->layers[ids[i]].name;
3334     }
3335     return names;
3336 }
3337
3338 void Net::getLayersShapes(const ShapesVec& netInputShapes,
3339                           std::vector<int>& layersIds,
3340                           std::vector<ShapesVec>& inLayersShapes,
3341                           std::vector<ShapesVec>& outLayersShapes) const
3342 {
3343     layersIds.clear();
3344     inLayersShapes.clear();
3345     outLayersShapes.clear();
3346
3347     Impl::LayersShapesMap inOutShapes;
3348     impl->getLayersShapes(netInputShapes, inOutShapes);
3349
3350     for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
3351         it != inOutShapes.end(); it++)
3352     {
3353         layersIds.push_back(it->first);
3354         inLayersShapes.push_back(it->second.in);
3355         outLayersShapes.push_back(it->second.out);
3356     }
3357 }
3358
3359 void Net::getLayersShapes(const MatShape& netInputShape,
3360                           std::vector<int>& layerIds,
3361                           std::vector<ShapesVec>& inLayersShapes,
3362                           std::vector<ShapesVec>& outLayersShapes) const
3363 {
3364     getLayersShapes(ShapesVec(1, netInputShape),
3365                     layerIds, inLayersShapes, outLayersShapes);
3366 }
3367
3368 void Net::getLayerShapes(const MatShape& netInputShape,
3369                          const int layerId,
3370                          ShapesVec& inLayerShapes,
3371                          ShapesVec& outLayerShapes) const
3372 {
3373     getLayerShapes(ShapesVec(1, netInputShape),
3374                    layerId, inLayerShapes, outLayerShapes);
3375
3376 }
3377
3378 void Net::getLayerShapes(const ShapesVec& netInputShapes,
3379                     const int layerId,
3380                     ShapesVec& inLayerShapes,
3381                     ShapesVec& outLayerShapes) const
3382 {
3383     LayerShapes shapes;
3384     impl->getLayerShapes(netInputShapes, layerId, shapes);
3385     inLayerShapes = shapes.in;
3386     outLayerShapes = shapes.out;
3387 }
3388
3389 int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
3390 {
3391     CV_TRACE_FUNCTION();
3392
3393     int64 flops = 0;
3394     std::vector<int> ids;
3395     std::vector<std::vector<MatShape> > inShapes, outShapes;
3396     getLayersShapes(netInputShapes, ids, inShapes, outShapes);
3397     CV_Assert(inShapes.size() == outShapes.size());
3398     CV_Assert(inShapes.size() == ids.size());
3399
3400     for(int i = 0; i < ids.size(); i++)
3401     {
3402         flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
3403                                                                    outShapes[i]);
3404     }
3405
3406     return flops;
3407 }
3408
3409 int64 Net::getFLOPS(const MatShape& netInputShape) const
3410 {
3411     return getFLOPS(std::vector<MatShape>(1, netInputShape));
3412 }
3413
3414 int64 Net::getFLOPS(const int layerId,
3415               const std::vector<MatShape>& netInputShapes) const
3416 {
3417     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3418     CV_Assert(layer != impl->layers.end());
3419
3420     LayerShapes shapes;
3421     impl->getLayerShapes(netInputShapes, layerId, shapes);
3422
3423     return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
3424 }
3425
3426 int64 Net::getFLOPS(const int layerId,
3427               const MatShape& netInputShape) const
3428 {
3429     return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
3430 }
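
// Usage sketch (illustrative; the NCHW input shape is hypothetical):
//
//     MatShape inp = shape(1, 3, 224, 224);
//     std::cout << "GFLOPs: " << net.getFLOPS(inp) * 1e-9 << std::endl;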
3431
3432 void Net::getLayerTypes(std::vector<String>& layersTypes) const
3433 {
3434     layersTypes.clear();
3435
3436     std::map<String, int> layers;
3437     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3438          it != impl->layers.end(); it++)
3439     {
3440         if (layers.find(it->second.type) == layers.end())
3441             layers[it->second.type] = 0;
3442         layers[it->second.type]++;
3443     }
3444
3445     for (std::map<String, int>::iterator it = layers.begin();
3446          it != layers.end(); it++)
3447     {
3448         layersTypes.push_back(it->first);
3449     }
3450 }
3451
3452 int Net::getLayersCount(const String& layerType) const
3453 {
3454     int count = 0;
3455     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3456          it != impl->layers.end(); it++)
3457     {
3458         if (it->second.type == layerType)
3459             count++;
3460     }
3461     return count;
3462 }
3463
3464 void Net::getMemoryConsumption(const int layerId,
3465                                const std::vector<MatShape>& netInputShapes,
3466                                size_t& weights, size_t& blobs) const
3467 {
3468     CV_TRACE_FUNCTION();
3469
3470     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3471     CV_Assert(layer != impl->layers.end());
3472
3473     weights = blobs = 0;
3474
3475     for(int i = 0; i < layer->second.params.blobs.size(); i++)
3476     {
3477         const Mat& weightsBlob = layer->second.params.blobs[i];
3478         weights += weightsBlob.total()*weightsBlob.elemSize();
3479     }
3480
3481     ShapesVec inLayerShapes, outLayerShapes;
3482     getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
3483     for(int i = 0; i < outLayerShapes.size(); i++)
3484     {
3485         blobs += total(outLayerShapes[i]) * sizeof(float);
3486     }
3487 }
3488
3489 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3490                                size_t& weights, size_t& blobs) const
3491 {
3492     CV_TRACE_FUNCTION();
3493
3494     std::vector<int> layerIds;
3495     std::vector<size_t> w, b;
3496     getMemoryConsumption(netInputShapes, layerIds, w, b);
3497
3498     weights = blobs = 0;
3499     for(int i = 0; i < layerIds.size(); i++)
3500     {
3501         weights += w[i];
3502         blobs += b[i];
3503     }
3504 }
3505
3506 void Net::getMemoryConsumption(const int layerId,
3507                                const MatShape& netInputShape,
3508                                size_t& weights, size_t& blobs) const
3509 {
3510     getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
3511                          weights, blobs);
3512 }
3513
3514 void Net::getMemoryConsumption(const MatShape& netInputShape,
3515                                size_t& weights, size_t& blobs) const
3516 {
3517     getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
3518                          weights, blobs);
3519 }
3520
3521 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3522                                   std::vector<int>& layerIds, std::vector<size_t>& weights,
3523                                   std::vector<size_t>& blobs) const
3524 {
3525     CV_TRACE_FUNCTION();
3526
3527     layerIds.clear();
3528     weights.clear();
3529     blobs.clear();
3530
3531     std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
3532
3533     getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
3534
3535     for(int i = 0; i < layerIds.size(); i++)
3536     {
3537         size_t w = 0, b = 0;  // byte counters: int could overflow for large models
3538         Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
3539         CV_Assert(layer != impl->layers.end());
3540
3541         for(int j = 0; j < layer->second.params.blobs.size(); j++)
3542         {
3543             const Mat& weightsBlob = layer->second.params.blobs[j];
3544             w += weightsBlob.total()*weightsBlob.elemSize();
3545         }
3546
3547         for(int j = 0; j < outLayerShapes[i].size(); j++)
3548         {
3549             b += total(outLayerShapes[i][j]) * sizeof(float);
3550         }
3551
3552         weights.push_back(w);
3553         blobs.push_back(b);
3554     }
3555 }
3556
3557 void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
3558                                std::vector<size_t>& weights, std::vector<size_t>& blobs) const
3559 {
3560     getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
3561                          weights, blobs);
3562 }
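
// Usage sketch (illustrative; the input shape is hypothetical):
//
//     size_t weights = 0, blobs = 0;
//     net.getMemoryConsumption(shape(1, 3, 224, 224), weights, blobs);
//     // weights: bytes of learned parameters; blobs: bytes of layer outputs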
3563
3564 void Net::enableFusion(bool fusion)
3565 {
3566     if( impl->fusion != fusion )
3567     {
3568         impl->fusion = fusion;
3569         impl->netWasAllocated = false;
3570         impl->clear();
3571     }
3572 }
3573
3574 void Net::setHalideScheduler(const String& scheduler)
3575 {
3576     CV_TRACE_FUNCTION();
3577     CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());
3578
3579     impl->halideConfigFile = scheduler;
3580 }
3581
3582 int64 Net::getPerfProfile(std::vector<double>& timings)
3583 {
3584     timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());  // skip the first entry (the network input pseudo-layer)
3585     int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
3586     return total;
3587 }
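
// Usage sketch (illustrative) converting the returned tick counts to milliseconds:
//
//     std::vector<double> layerTimes;
//     double totalMs = net.getPerfProfile(layerTimes) / getTickFrequency() * 1000.0;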
3588
3589 //////////////////////////////////////////////////////////////////////////
3590
3591 Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
3592
3593 Layer::Layer(const LayerParams &params)
3594     : blobs(params.blobs), name(params.name), type(params.type)
3595 {
3596     preferableTarget = DNN_TARGET_CPU;
3597 }
3598
3599 void Layer::setParamsFrom(const LayerParams &params)
3600 {
3601     blobs = params.blobs;
3602     name = params.name;
3603     type = params.type;
3604 }
3605
3606 int Layer::inputNameToIndex(String)
3607 {
3608     return -1;
3609 }
3610
3611 int Layer::outputNameToIndex(const String&)
3612 {
3613     return 0;
3614 }
3615
3616 bool Layer::supportBackend(int backendId)
3617 {
3618     return backendId == DNN_BACKEND_OPENCV;
3619 }
3620
3621 Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
3622 {
3623     CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
3624                                        " layers is not defined.");
3625     return Ptr<BackendNode>();
3626 }
3627
3628 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
3629 {
3630     CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
3631                                        " layers is not defined.");
3632     return Ptr<BackendNode>();
3633 }
3634
3635 Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
3636 {
3637     CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
3638                                        " layers is not defined.");
3639     return Ptr<BackendNode>();
3640 }
3641
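// Generic auto-schedule applied when no manual schedule is supplied via the
// config file: 1x1 spatial outputs are fused over channels/batch and
// parallelized (CPU) or mapped to GPU blocks/threads (OpenCL); larger outputs
// are tiled along y and vectorized along x (CPU), or split into 3-D GPU
// blocks/threads (OpenCL).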
3642 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
3643                                  const std::vector<Mat> &outputs, int targetId) const
3644 {
3645 #ifdef  HAVE_HALIDE
3646     CV_TRACE_FUNCTION();
3647
3648     Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
3649                 xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
3650     Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
3651
3652     int outW, outH, outC, outN;
3653     getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
3654
3655     if (targetId == DNN_TARGET_CPU)
3656     {
3657         if (outW == 1 && outH == 1)
3658         {
3659             if (outC + outN == 1)
3660                 return;
3661
3662             if (outC > 8)
3663               top.split(c, co, ci, 8)
3664                  .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3665                  .parallel(tile)
3666                  .vectorize(ci, 8);
3667             else
3668               top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
3669                  .parallel(tile);
3670         }
3671         else
3672         {
3673             if (outH > 2)
3674             {
3675                 top.reorder(x, c, y)
3676                    .split(y, yo, yi, 2)
3677                    .fuse(yo, n, tile)
3678                    .parallel(tile)
3679                    .unroll(yi)
3680                    .vectorize(x, outW >= 16 ? 16 : outW);
3681             }
3682         }
3683     }
3684     else if (targetId == DNN_TARGET_OPENCL)
3685     {
3686         if (outW == 1 && outH == 1)
3687         {
3688             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
3689             top.split(c, co, ci, c_split)
3690                .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3691                .gpu_blocks(tile)
3692                .gpu_threads(ci);
3693         }
3694         else
3695         {
3696             int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
3697             int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
3698             // Supported vectorization widths: 2, 3, 4, 8, 16
3699             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
3700             top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
3701                .split(c, co, ci, c_split)
3702                .gpu_blocks(xo, yo, co)
3703                .gpu_threads(xi, yi)
3704                .reorder(xi, yi, ci, xo, yo, co)
3705                .vectorize(ci);
3706         }
3707     }
3708     else
3709         CV_Error(Error::StsNotImplemented, "Unknown target identifier");
3710 #endif  // HAVE_HALIDE
3711 }
3712
3713 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
3714 {
3715     CV_UNUSED(node); return Ptr<BackendNode>();
3716 }
3717
3718 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
3719 bool Layer::tryFuse(Ptr<Layer>&) { return false; }
3720 void Layer::getScaleShift(Mat& scale, Mat& shift) const
3721 {
3722     scale = Mat();
3723     shift = Mat();
3724 }
3725
3726 void Layer::unsetAttached()
3727 {
3728     setActivation(Ptr<ActivationLayer>());
3729 }
3730
3731 template <typename T>
3732 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
3733 {
3734     pv.resize(v.size());
3735     for (size_t i = 0; i < v.size(); i++)
3736         pv[i] = const_cast<T*>(&v[i]);
3737 }
3738
3739 void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
3740 {
3741     CV_TRACE_FUNCTION();
3742     this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
3743 }
3744
3745 void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
3746 {
3747     CV_UNUSED(input);CV_UNUSED(output);
3748 }
3749
3750 void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
3751 {
3752     CV_TRACE_FUNCTION();
3753     std::vector<Mat> inputs, outputs;
3754     inputs_arr.getMatVector(inputs);
3755     outputs_arr.getMatVector(outputs);
3756
3757     std::vector<Mat*> inputsp;
3758     vecToPVec(inputs, inputsp);
3759     this->finalize(inputsp, outputs);
3760 }
3761
3762 std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
3763 {
3764     CV_TRACE_FUNCTION();
3765
3766     std::vector<Mat> outputs;
3767     this->finalize(inputs, outputs);
3768     return outputs;
3769 }
3770
3771 void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
3772 {
3773     // We kept this method for compatibility. DNN calls it now only to support users' implementations.
3774 }
3775
3776 void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3777 {
3778     CV_TRACE_FUNCTION();
3779     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3780
3781     Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
3782 }
3783
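// forward_fallback() lets layers without an FP16 kernel run on the
// DNN_TARGET_OPENCL_FP16 target: FP16 blobs (stored as CV_16S) are converted
// to CV_32F, the regular forward() is executed, and the outputs are converted
// back to FP16.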
3784 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3785 {
3786     CV_TRACE_FUNCTION();
3787     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3788
3789     if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
3790     {
3791         std::vector<UMat> inputs;
3792         std::vector<UMat> outputs;
3793         std::vector<UMat> internals;
3794
3795         std::vector<UMat> orig_inputs;
3796         std::vector<UMat> orig_outputs;
3797         std::vector<UMat> orig_internals;
3798
3799         inputs_arr.getUMatVector(orig_inputs);
3800         outputs_arr.getUMatVector(orig_outputs);
3801         internals_arr.getUMatVector(orig_internals);
3802
3803         inputs.resize(orig_inputs.size());
3804         for (size_t i = 0; i < orig_inputs.size(); i++)
3805             convertFp16(orig_inputs[i], inputs[i]);
3806
3807         outputs.resize(orig_outputs.size());
3808         for (size_t i = 0; i < orig_outputs.size(); i++)
3809             outputs[i].create(shape(orig_outputs[i]), CV_32F);
3810
3811         internals.resize(orig_internals.size());
3812         for (size_t i = 0; i < orig_internals.size(); i++)
3813             internals[i].create(shape(orig_internals[i]), CV_32F);
3814
3815         forward(inputs, outputs, internals);
3816
3817         for (size_t i = 0; i < outputs.size(); i++)
3818             convertFp16(outputs[i], orig_outputs[i]);
3819
3820         // sync results back
3821         outputs_arr.assign(orig_outputs);
3822         internals_arr.assign(orig_internals);
3823         return;
3824     }
3825     std::vector<Mat> inpvec;
3826     std::vector<Mat> outputs;
3827     std::vector<Mat> internals;
3828
3829     inputs_arr.getMatVector(inpvec);
3830     outputs_arr.getMatVector(outputs);
3831     internals_arr.getMatVector(internals);
3832
3833     std::vector<Mat*> inputs(inpvec.size());
3834     for (int i = 0; i < inpvec.size(); i++)
3835         inputs[i] = &inpvec[i];
3836
3837     this->forward(inputs, outputs, internals);
3838
3839     // sync results back
3840     outputs_arr.assign(outputs);
3841     internals_arr.assign(internals);
3842 }
3843
3844 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
3845 {
3846     CV_TRACE_FUNCTION();
3847
3848     this->finalize(inputs, outputs);
3849     this->forward(inputs, outputs, internals);
3850 }
3851
3852 Layer::~Layer() {}
3853
3854 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
3855                             const int requiredOutputs,
3856                             std::vector<MatShape> &outputs,
3857                             std::vector<MatShape> &internals) const
3858 {
3859     CV_Assert(inputs.size());
3860     outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
3861     return false;
3862 }
3863
3864 //////////////////////////////////////////////////////////////////////////
3865
3866 static Mutex& getLayerFactoryMutex()
3867 {
3868     static Mutex* volatile instance = NULL;
3869     if (instance == NULL)
3870     {
3871         cv::AutoLock lock(getInitializationMutex());
3872         if (instance == NULL)
3873             instance = new Mutex();
3874     }
3875     return *instance;
3876 }
3877
3878 typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
3879
3880 static LayerFactory_Impl& getLayerFactoryImpl_()
3881 {
3882     static LayerFactory_Impl impl;
3883     return impl;
3884 }
3885
3886 static LayerFactory_Impl& getLayerFactoryImpl()
3887 {
3888     static LayerFactory_Impl* volatile instance = NULL;
3889     if (instance == NULL)
3890     {
3891         cv::AutoLock lock(getLayerFactoryMutex());
3892         if (instance == NULL)
3893         {
3894             instance = &getLayerFactoryImpl_();
3895             initializeLayerFactory();
3896         }
3897     }
3898     return *instance;
3899 }
3900
3901 void LayerFactory::registerLayer(const String &type, Constructor constructor)
3902 {
3903     CV_TRACE_FUNCTION();
3904     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3905
3906     cv::AutoLock lock(getLayerFactoryMutex());
3907     String type_ = toLowerCase(type);
3908     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
3909
3910     if (it != getLayerFactoryImpl().end())
3911     {
3912         if (it->second.back() == constructor)
3913             CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
3914         it->second.push_back(constructor);
3915     }
3916     getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));  // no-op if the type was already present
3917 }
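
// Registration sketch (illustrative; MyLayer is a hypothetical Layer subclass
// providing `static Ptr<Layer> create(LayerParams&)`):
//
//     LayerFactory::registerLayer("MyType", MyLayer::create);
//     ...
//     LayerFactory::unregisterLayer("MyType");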
3918
3919 void LayerFactory::unregisterLayer(const String &type)
3920 {
3921     CV_TRACE_FUNCTION();
3922     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3923
3924     cv::AutoLock lock(getLayerFactoryMutex());
3925     String type_ = toLowerCase(type);
3926
3927     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
3928     if (it != getLayerFactoryImpl().end())
3929     {
3930         if (it->second.size() > 1)
3931             it->second.pop_back();
3932         else
3933             getLayerFactoryImpl().erase(it);
3934     }
3935 }
3936
3937 Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
3938 {
3939     CV_TRACE_FUNCTION();
3940     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3941
3942     cv::AutoLock lock(getLayerFactoryMutex());
3943     String type_ = toLowerCase(type);
3944     LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);
3945
3946     if (it != getLayerFactoryImpl().end())
3947     {
3948         CV_Assert(!it->second.empty());
3949         return it->second.back()(params);
3950     }
3951     else
3952     {
3953         return Ptr<Layer>(); //NULL
3954     }
3955 }
3956
3957 BackendNode::BackendNode(int backendId) : backendId(backendId) {}
3958
3959 BackendNode::~BackendNode() {}
3960
3961 BackendWrapper::BackendWrapper(int backendId, int targetId)
3962     : backendId(backendId), targetId(targetId) {}
3963
3964 BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
3965 {
3966     CV_Error(Error::StsNotImplemented,
3967              "Constructor of backend wrapper must be implemented");
3968 }
3969
3970 BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
3971 {
3972     CV_Error(Error::StsNotImplemented,
3973              "Constructor of backend wrapper must be implemented");
3974 }
3975
3976 BackendWrapper::~BackendWrapper() {}
3977
3978 Net readNet(const String& _model, const String& _config, const String& _framework)
3979 {
3980     String framework = toLowerCase(_framework);
3981     String model = _model;
3982     String config = _config;
3983     const std::string modelExt = model.substr(model.rfind('.') + 1);
3984     const std::string configExt = config.substr(config.rfind('.') + 1);
3985     if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
3986                                 modelExt == "prototxt" || configExt == "prototxt")
3987     {
3988         if (modelExt == "prototxt" || configExt == "caffemodel")
3989             std::swap(model, config);
3990         return readNetFromCaffe(config, model);
3991     }
3992     if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
3993                                      modelExt == "pbtxt" || configExt == "pbtxt")
3994     {
3995         if (modelExt == "pbtxt" || configExt == "pb")
3996             std::swap(model, config);
3997         return readNetFromTensorflow(model, config);
3998     }
3999     if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
4000                                 configExt == "t7" || configExt == "net")
4001     {
4002         return readNetFromTorch(model.empty() ? config : model);
4003     }
4004     if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
4005                                   modelExt == "cfg" || configExt == "cfg")
4006     {
4007         if (modelExt == "cfg" || configExt == "weights")
4008             std::swap(model, config);
4009         return readNetFromDarknet(config, model);
4010     }
4011     if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
4012                                modelExt == "xml" || configExt == "xml")
4013     {
4014         if (modelExt == "xml" || configExt == "bin")
4015             std::swap(model, config);
4016         return readNetFromModelOptimizer(config, model);
4017     }
4018     if (framework == "onnx" || modelExt == "onnx")
4019     {
4020         return readNetFromONNX(model);
4021     }
4022     CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
4023                                       model + (config.empty() ? "" : ", " + config));
4024 }
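
// Usage sketch (illustrative; all paths are hypothetical). The framework is
// guessed from the file extensions, and model/config order is normalized by
// the swaps above:
//
//     Net a = readNet("deploy.prototxt", "model.caffemodel");  // Caffe
//     Net b = readNet("frozen_graph.pb");                      // TensorFlow
//     Net c = readNet("model.xml", "model.bin");               // Intel IR (DLDT)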
4025
4026 Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
4027             const std::vector<uchar>& bufferConfig)
4028 {
4029     String framework = toLowerCase(_framework);
4030     if (framework == "caffe")
4031         return readNetFromCaffe(bufferConfig, bufferModel);
4032     else if (framework == "tensorflow")
4033         return readNetFromTensorflow(bufferModel, bufferConfig);
4034     else if (framework == "darknet")
4035         return readNetFromDarknet(bufferConfig, bufferModel);
4036     else if (framework == "torch")
4037         CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
4038     else if (framework == "dldt")
4039         CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
4040     CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
4041 }
4042
4043 Net readNetFromModelOptimizer(const String &xml, const String &bin)
4044 {
4045     return Net::readFromModelOptimizer(xml, bin);
4046 }
4047
4048 CV__DNN_INLINE_NS_END
4049 }} // namespace