1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of the copyright holders may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the Intel Corporation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41
42 #include "precomp.hpp"
43 #include "op_halide.hpp"
44 #include "op_inf_engine.hpp"
45 #include "op_vkcom.hpp"
46 #include "halide_scheduler.hpp"
47 #include <set>
48 #include <algorithm>
49 #include <iostream>
50 #include <sstream>
51 #include <fstream>
52 #include <iterator>
53 #include <numeric>
54 #include <opencv2/dnn/shape_utils.hpp>
55 #include <opencv2/imgproc.hpp>
56
57 #include <opencv2/core/utils/configuration.private.hpp>
58 #include <opencv2/core/utils/logger.hpp>
59
60 namespace cv {
61 namespace dnn {
62 CV__DNN_INLINE_NS_BEGIN
63
64 // This option is useful for running memory error detection with valgrind.
65 static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);
66
67 #ifdef HAVE_OPENCL
68 static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
69 #endif
70
71 static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
72 #ifdef HAVE_INF_ENGINE
73     (size_t)DNN_BACKEND_INFERENCE_ENGINE
74 #else
75     (size_t)DNN_BACKEND_OPENCV
76 #endif
77 );
78
79 // Additional checks (slow down execution!)
80 static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
81 static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
82 static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);
83
84 using std::vector;
85 using std::map;
86 using std::make_pair;
87 using std::set;
88
89 //==================================================================================================
90
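// BackendRegistry is a lazily-initialized singleton that enumerates the
// (backend, target) pairs usable in this particular build: Halide, Inference
// Engine, OpenCL and Vulkan entries are added only when the corresponding
// HAVE_* macro is defined and the runtime availability check succeeds.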
91 class BackendRegistry
92 {
93 public:
94     typedef std::vector< std::pair<Backend, Target> > BackendsList;
95     const BackendsList & getBackends() const { return backends; }
96     static BackendRegistry & getRegistry()
97     {
98         static BackendRegistry impl;
99         return impl;
100     }
101 private:
102     BackendRegistry()
103     {
104 #ifdef HAVE_HALIDE
105         backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_CPU));
106 #  ifdef HAVE_OPENCL
107         if (cv::ocl::useOpenCL())
108             backends.push_back(std::make_pair(DNN_BACKEND_HALIDE, DNN_TARGET_OPENCL));
109 #  endif
110 #endif // HAVE_HALIDE
111
112 #ifdef HAVE_INF_ENGINE
113         if (checkIETarget(DNN_TARGET_CPU))
114             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_CPU));
115         if (checkIETarget(DNN_TARGET_MYRIAD))
116             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
117         if (checkIETarget(DNN_TARGET_FPGA))
118             backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_FPGA));
119 #  ifdef HAVE_OPENCL
120         if (cv::ocl::useOpenCL() && ocl::Device::getDefault().isIntel())
121         {
122             if (checkIETarget(DNN_TARGET_OPENCL))
123                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL));
124             if (checkIETarget(DNN_TARGET_OPENCL_FP16))
125                 backends.push_back(std::make_pair(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
126         }
127 #  endif
128 #endif // HAVE_INF_ENGINE
129
130 #ifdef HAVE_OPENCL
131         if (cv::ocl::useOpenCL())
132         {
133             backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL));
134             backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL_FP16));
135         }
136 #endif
137
138         backends.push_back(std::make_pair(DNN_BACKEND_OPENCV, DNN_TARGET_CPU));
139
140 #ifdef HAVE_VULKAN
141         if (haveVulkan())
142             backends.push_back(std::make_pair(DNN_BACKEND_VKCOM, DNN_TARGET_VULKAN));
143 #endif
144     }
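    // Probes an Inference Engine target by building a minimal single-layer
    // network (a 1x1 convolution) and running one forward pass on it; any
    // exception thrown by the plugin means the target is treated as unavailable.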
145     static inline bool checkIETarget(int target)
146     {
147 #ifndef HAVE_INF_ENGINE
148         return false;
149 #else
150         cv::dnn::Net net;
151         cv::dnn::LayerParams lp;
152         lp.set("kernel_size", 1);
153         lp.set("num_output", 1);
154         lp.set("bias_term", false);
155         lp.type = "Convolution";
156         lp.name = "testLayer";
157         lp.blobs.push_back(Mat({1, 2, 1, 1}, CV_32F, Scalar(1)));
158         net.addLayerToPrev(lp.name, lp.type, lp);
159         net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
160         net.setPreferableTarget(target);
161         static int inpDims[] = {1, 2, 3, 4};
162         net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
163         try
164         {
165             net.forward();
166         }
167         catch(...)
168         {
169             return false;
170         }
171         return true;
172 #endif
173     }
174
175     BackendsList backends;
176 };
177
178
179 std::vector< std::pair<Backend, Target> > getAvailableBackends()
180 {
181     return BackendRegistry::getRegistry().getBackends();
182 }
183
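// Usage sketch (illustrative): list the targets usable with the OpenCV backend.
//
//   std::vector<Target> targets = getAvailableTargets(DNN_BACKEND_OPENCV);
//   // DNN_TARGET_CPU is always present; the OpenCL targets appear only when
//   // cv::ocl::useOpenCL() returns true.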
184 std::vector<Target> getAvailableTargets(Backend be)
185 {
186     if (be == DNN_BACKEND_DEFAULT)
187         be = (Backend)PARAM_DNN_BACKEND_DEFAULT;
188
189     std::vector<Target> result;
190     const BackendRegistry::BackendsList all_backends = getAvailableBackends();
191     for(BackendRegistry::BackendsList::const_iterator i = all_backends.begin(); i != all_backends.end(); ++i )
192     {
193         if (i->first == be)
194             result.push_back(i->second);
195     }
196     return result;
197 }
198
199 //==================================================================================================
200
201 namespace
202 {
203     typedef std::vector<MatShape> ShapesVec;
204
205     struct LayerShapes
206     {
207         ShapesVec in, out, internal;
208         // There is no guarantee that a layer which supports in-place computation
209         // will actually be computed in-place (input.data_ptr == output.data_ptr).
210         // If a layer reports that it can work in-place and the layers after it
211         // no longer use the input blob, we set output = input.
212         bool supportInPlace;
213         LayerShapes() {supportInPlace = false;}
214     };
215 }
216
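// Typical preprocessing call (sketch; the file name and input geometry are
// illustrative, not taken from this file):
//
//   Mat img = imread("example.jpg");                        // BGR, CV_8UC3
//   Mat blob = blobFromImage(img, 1.0 / 255, Size(224, 224),
//                            Scalar(), /*swapRB=*/true, /*crop=*/false);
//   // blob has shape 1 x 3 x 224 x 224 (NCHW); the default depth is CV_32F.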
217 Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
218                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
219 {
220     CV_TRACE_FUNCTION();
221     Mat blob;
222     blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
223     return blob;
224 }
225
226 void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
227                    const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
228 {
229     CV_TRACE_FUNCTION();
230     std::vector<Mat> images(1, image.getMat());
231     blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
232 }
233
234 Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
235                    const Scalar& mean, bool swapRB, bool crop, int ddepth)
236 {
237     CV_TRACE_FUNCTION();
238     Mat blob;
239     blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
240     return blob;
241 }
242
243 void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
244                     Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
245 {
246     CV_TRACE_FUNCTION();
247     CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
248     if (ddepth == CV_8U)
249     {
250         CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
251         CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
252     }
253
254     std::vector<Mat> images;
255     images_.getMatVector(images);
256     CV_Assert(!images.empty());
257     for (size_t i = 0; i < images.size(); i++)
258     {
259         Size imgSize = images[i].size();
260         if (size == Size())
261             size = imgSize;
262         if (size != imgSize)
263         {
264             if(crop)
265             {
266               float resizeFactor = std::max(size.width / (float)imgSize.width,
267                                             size.height / (float)imgSize.height);
268               resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
269               Rect crop(Point(0.5 * (images[i].cols - size.width),
270                               0.5 * (images[i].rows - size.height)),
271                         size);
272               images[i] = images[i](crop);
273             }
274             else
275               resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
276         }
277         if(images[i].depth() == CV_8U && ddepth == CV_32F)
278             images[i].convertTo(images[i], CV_32F);
279         Scalar mean = mean_;
280         if (swapRB)
281             std::swap(mean[0], mean[2]);
282
283         images[i] -= mean;
284         images[i] *= scalefactor;
285     }
286
287     size_t nimages = images.size();
288     Mat image0 = images[0];
289     int nch = image0.channels();
290     CV_Assert(image0.dims == 2);
291     if (nch == 3 || nch == 4)
292     {
293         int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
294         blob_.create(4, sz, ddepth);
295         Mat blob = blob_.getMat();
296         Mat ch[4];
297
298         for(size_t i = 0; i < nimages; i++ )
299         {
300             const Mat& image = images[i];
301             CV_Assert(image.depth() == blob_.depth());
302             nch = image.channels();
303             CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
304             CV_Assert(image.size() == image0.size());
305
306             for( int j = 0; j < nch; j++ )
307                 ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
308             if(swapRB)
309                 std::swap(ch[0], ch[2]);
310             split(image, ch);
311         }
312     }
313     else
314     {
315        CV_Assert(nch == 1);
316        int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
317        blob_.create(4, sz, ddepth);
318        Mat blob = blob_.getMat();
319
320        for(size_t i = 0; i < nimages; i++ )
321        {
322            const Mat& image = images[i];
323            CV_Assert(image.depth() == blob_.depth());
324            nch = image.channels();
325            CV_Assert(image.dims == 2 && (nch == 1));
326            CV_Assert(image.size() == image0.size());
327
328            image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
329        }
330     }
331 }
332
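// Inverse helper to blobFromImages(): splits a 4-D NCHW blob back into a
// vector of 2-D images, one per batch element, by merging the per-channel
// planes. Usage sketch (names are illustrative):
//
//   std::vector<Mat> imgs;
//   imagesFromBlob(blob, imgs);   // imgs.size() == blob.size[0]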
333 void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
334 {
335     CV_TRACE_FUNCTION();
336
337     // A blob is a 4-dimensional matrix in floating point precision:
338     //   blob_.size[0] = batchSize = nbOfImages
339     //   blob_.size[1] = nbOfChannels
340     //   blob_.size[2] = height
341     //   blob_.size[3] = width
342     CV_Assert(blob_.depth() == CV_32F);
343     CV_Assert(blob_.dims == 4);
344
345     images_.create(cv::Size(1, blob_.size[0]), blob_.depth());
346
347     std::vector<Mat> vectorOfChannels(blob_.size[1]);
348     for (int n = 0; n <  blob_.size[0]; ++n)
349     {
350         for (int c = 0; c < blob_.size[1]; ++c)
351         {
352             vectorOfChannels[c] = getPlane(blob_, n, c);
353         }
354         cv::merge(vectorOfChannels, images_.getMatRef(n));
355     }
356 }
357
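// OpenCL wrapper over a host Mat. The UMat is the device-side copy; hostDirty
// marks that the host data has changed and must be re-uploaded (copyToDevice)
// before the UMat is used, while copyToHost() downloads results back into the
// original Mat. The second constructor creates a UMat view into an existing
// base buffer, so reused host memory also shares device memory.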
358 class OpenCLBackendWrapper : public BackendWrapper
359 {
360 public:
361     OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
362     {
363         m.copyTo(umat);
364         host = &m;
365         hostDirty = false;
366     }
367
368     OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
369         : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
370     {
371         Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
372         CV_Assert(!base.empty());
373
374         host = &m;
375
376         int shape[] = {1, (int)base->umat.total()};
377         umat = base->umat.reshape(1, 2, &shape[0])
378                          .colRange(0, host->total())
379                          .reshape(1, host->dims, &host->size[0]);
380         hostDirty = false;
381     }
382
383     static Ptr<BackendWrapper> create(Mat& m)
384     {
385         return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
386     }
387
388     static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
389     {
390         return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
391     }
392
393     static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
394     {
395         const int numWrappers = wrappers.size();
396         std::vector<UMat> mats(wrappers.size());
397         for (int i = 0; i < numWrappers; ++i)
398         {
399             Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
400             CV_Assert(!umatWrapper.empty());
401             umatWrapper->copyToDevice();
402             mats[i] = umatWrapper->umat;
403         }
404         return mats;
405     }
406
407     // Replaces all umats in wrappers with the specified ones.
408     static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
409                        const std::vector<UMat>& umats)
410     {
411         CV_Assert(wrappers.size() == umats.size());
412         for (int i = 0, n = umats.size(); i < n; ++i)
413         {
414             Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
415             CV_Assert(!umatWrapper.empty());
416             umatWrapper->umat = umats[i];
417         }
418     }
419
420     ~OpenCLBackendWrapper() {}
421
422     // Copies data from device to host memory.
423     virtual void copyToHost() CV_OVERRIDE
424     {
425         umat.copyTo(*host);
426     }
427
428     virtual void setHostDirty() CV_OVERRIDE
429     {
430         hostDirty = true;
431     };
432
433     void copyToDevice()
434     {
435         if (hostDirty)
436         {
437             host->copyTo(umat);
438             hostDirty = false;
439         }
440     }
441
442 private:
443     UMat umat;
444     Mat* host;
445     bool hostDirty;
446 };
447
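// LayerPin addresses a single layer output: lid is the layer id, oid is the
// index of the output blob within that layer. It defines operator< so it can
// be used as a key in std::map (see BlobManager below).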
448 struct LayerPin
449 {
450     int lid;
451     int oid;
452
453     LayerPin(int layerId = -1, int outputId = -1)
454         : lid(layerId), oid(outputId) {}
455
456     bool valid() const
457     {
458         return (lid >= 0 && oid >= 0);
459     }
460
461     bool equal(const LayerPin &r) const
462     {
463         return (lid == r.lid && oid == r.oid);
464     }
465
466     bool operator<(const LayerPin &r) const
467     {
468         return lid < r.lid || (lid == r.lid && oid < r.oid);
469     }
470
471     bool operator ==(const LayerPin &r) const
472     {
473         return lid == r.lid && oid == r.oid;
474     }
475 };
476
477 struct LayerData
478 {
479     LayerData() : id(-1), skip(false), flag(0) {}
480     LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
481         : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
482     {
483         CV_TRACE_FUNCTION();
484
485         //add logging info
486         params.name = name;
487         params.type = type;
488     }
489
490     int id;
491     String name;
492     String type;
493     LayerParams params;
494
495     std::vector<LayerPin> inputBlobsId;
496     std::set<int> inputLayersId;
497     std::set<int> requiredOutputs;
498     std::vector<LayerPin> consumers;
499     std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
500     std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
501     std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;
502
503     Ptr<Layer> layerInstance;
504     std::vector<Mat> outputBlobs;
505     std::vector<Mat*> inputBlobs;
506     std::vector<Mat> internals;
507     // Computation nodes of implemented backends (except DEFAULT).
508     std::map<int, Ptr<BackendNode> > backendNodes;
509     // Flag to skip this layer's computation for a specific backend.
510     bool skip;
511
512     int flag;
513
514     Ptr<Layer> getLayerInstance()
515     {
516         CV_TRACE_FUNCTION();
517         CV_TRACE_ARG_VALUE(type, "type", type.c_str());
518
519         if (layerInstance)
520             return layerInstance;
521
522         layerInstance = LayerFactory::createLayerInstance(type, params);
523         if (!layerInstance)
524         {
525             CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
526         }
527
528         return layerInstance;
529     }
530 };
531
532 // Fake layer containing the network input blobs.
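// It applies per-input preprocessing (scale factor and mean subtraction,
// stored in scaleFactors/means below) during forward(), and finalize() marks
// the layer as skippable when that preprocessing is an identity transform.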
533 struct DataLayer : public Layer
534 {
535     DataLayer() : Layer()
536     {
537         skip = false;
538     }
539
540     virtual bool supportBackend(int backendId) CV_OVERRIDE
541     {
542         return backendId == DNN_BACKEND_OPENCV ||
543                (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
544     }
545
546     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
547     {
548         CV_TRACE_FUNCTION();
549         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
550
551         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
552                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
553
554         if (outputs_arr.depth() == CV_16S)
555         {
556             forward_fallback(inputs_arr, outputs_arr, internals_arr);
557             return;
558         }
559
560         std::vector<Mat> outputs, internals;
561         outputs_arr.getMatVector(outputs);
562         internals_arr.getMatVector(internals);
563
564         // Supported modes:
565         // | Input type | Output type |
566         // |       fp32 |        fp32 |
567         // |      uint8 |        fp32 |
568         for (int i = 0; i < inputsData.size(); ++i)
569         {
570             double scale = scaleFactors[i];
571             Scalar& mean = means[i];
572             CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
573             CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");
574
575             bool singleMean = true;
576             for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
577             {
578                 singleMean = mean[j] == mean[j - 1];
579             }
580
581             if (singleMean)
582             {
583                 inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
584             }
585             else
586             {
587                 for (int n = 0; n < inputsData[i].size[0]; ++n)
588                     for (int c = 0; c < inputsData[i].size[1]; ++c)
589                     {
590                         Mat inp = getPlane(inputsData[i], n, c);
591                         Mat out = getPlane(outputs[i], n, c);
592                         inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
593                     }
594             }
595         }
596     }
597
598 #ifdef HAVE_OPENCL
599     std::vector<Mat> tmp_expressions;
600     bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
601     {
602         // Supported modes:
603         // | Input type | Output type |
604         // |       fp32 |        fp32 |
605         // |       fp32 |        fp16 |
606         // |      uint8 |        fp32 |
607         std::vector<UMat> outputs;
608         outputs_.getUMatVector(outputs);
609
610         tmp_expressions.clear();
611         for (int i = 0; i < inputsData.size(); ++i)
612         {
613             Mat inputData = inputsData[i];
614
615             double scale = scaleFactors[i];
616             Scalar& mean = means[i];
617
618             CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
619             bool singleMean = true;
620             for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
621             {
622                 singleMean = mean[j] == mean[j - 1];
623             }
624
625             if (outputs_.depth() == CV_16S)
626             {
627                 if (singleMean)
628                 {
629                     tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
630                     convertFp16(tmp_expressions.back(), outputs[i]);
631                 }
632                 else
633                 {
634                     for (int n = 0; n < inputsData[i].size[0]; ++n)
635                         for (int c = 0; c < inputsData[i].size[1]; ++c)
636                         {
637                             Mat inp = getPlane(inputsData[i], n, c);
638
639                             std::vector<cv::Range> plane(4, Range::all());
640                             plane[0] = Range(n, n + 1);
641                             plane[1] = Range(c, c + 1);
642                             UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
643
644                             tmp_expressions.push_back(scale * (inp - mean[c]));
645                             convertFp16(tmp_expressions.back(), out);
646                         }
647                 }
648             }
649             else
650             {
651                 CV_Assert(outputs_.depth() == CV_32F);
652                 if (singleMean)
653                 {
654                     inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
655                 }
656                 else
657                 {
658                     for (int n = 0; n < inputsData[i].size[0]; ++n)
659                         for (int c = 0; c < inputsData[i].size[1]; ++c)
660                         {
661                             Mat inp = getPlane(inputsData[i], n, c);
662
663                             std::vector<cv::Range> plane(4, Range::all());
664                             plane[0] = Range(n, n + 1);
665                             plane[1] = Range(c, c + 1);
666                             UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);
667
668                             inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
669                         }
670                 }
671             }
672         }
673         return true;
674     }
675 #endif
676
677     int outputNameToIndex(const String& tgtName) CV_OVERRIDE
678     {
679         int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
680         return (idx < (int)outNames.size()) ? idx : -1;
681     }
682
683     void setNames(const std::vector<String> &names)
684     {
685         outNames.assign(names.begin(), names.end());
686     }
687
688     bool getMemoryShapes(const std::vector<MatShape> &inputs,
689                          const int requiredOutputs,
690                          std::vector<MatShape> &outputs,
691                          std::vector<MatShape> &internals) const CV_OVERRIDE
692     {
693         CV_Assert(inputs.size() == requiredOutputs);
694         outputs.assign(inputs.begin(), inputs.end());
695         return false;
696     }
697
698     virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
699     {
700         std::vector<Mat> outputs;
701         outputs_arr.getMatVector(outputs);
702
703         CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
704                   inputsData.size() == outputs.size());
705         skip = true;
706         for (int i = 0; skip && i < inputsData.size(); ++i)
707         {
708             if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
709                 skip = false;
710         }
711     }
712
713     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
714     {
715 #ifdef HAVE_INF_ENGINE
716         CV_CheckEQ(inputsData.size(), (size_t)1, "");
717         CV_CheckEQ(inputsData[0].dims, 4, "");
718         const size_t numChannels = inputsData[0].size[1];
719         CV_Assert(numChannels <= 4);
720
721         // Scale
722         InferenceEngine::TensorDesc td(InferenceEngine::Precision::FP32, {numChannels},
723                                        InferenceEngine::Layout::C);
724         auto weights = InferenceEngine::make_shared_blob<float>(td);
725         weights->allocate();
726
727         float* weight_buf = weights->buffer().as<float*>();
728         std::fill(weight_buf, weight_buf + numChannels, scaleFactors[0]);
729
730         // Mean subtraction
731         auto biases = InferenceEngine::make_shared_blob<float>(td);
732         biases->allocate();
733         float* bias_buf = biases->buffer().as<float*>();
734
735         for (int i = 0; i < numChannels; ++i)
736         {
737             bias_buf[i] = -means[0][i] * scaleFactors[0];
738         }
739
740         InferenceEngine::Builder::Layer ieLayer = InferenceEngine::Builder::ScaleShiftLayer(name);
741         addConstantData("weights", weights, ieLayer);
742         addConstantData("biases", biases, ieLayer);
743         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
744 #endif  // HAVE_INF_ENGINE
745         return Ptr<BackendNode>();
746     }
747
748     std::vector<String> outNames;
749     // Preprocessing parameters for each network's input.
750     std::vector<double> scaleFactors;
751     std::vector<Scalar> means;
752     std::vector<Mat> inputsData;
753     bool skip;
754 };
755
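// BlobManager implements the memory-reuse optimization: every layer output is
// reference-counted (refCounter), pins that share one allocation are mapped to
// a common "memory host" (reuseMap), and reuseOrCreate() hands out an existing
// blob whose reference count dropped to zero instead of allocating a new one.
// Setting OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS=1 turns the reuse off.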
756 struct BlobManager
757 {
758 public:
759     // Increase the reference counter of a layer output.
760     void addReference(const LayerPin& lp)
761     {
762         std::map<LayerPin, int>::iterator it = refCounter.find(lp);
763         if (it == refCounter.end())
764             refCounter[lp] = 1;
765         else
766             it->second += 1;
767     }
768
769     void addReferences(const std::vector<LayerPin>& pins)
770     {
771         for (int i = 0; i < pins.size(); i++)
772         {
773             addReference(pins[i]);
774         }
775     }
776
777     // Returns the number of references to the allocated memory that is used
778     // by a specific layer blob.
779     int numReferences(const LayerPin& lp)
780     {
781         std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
782         CV_Assert(mapIt != reuseMap.end());
783         LayerPin memHost = mapIt->second;
784
785         std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
786         CV_Assert(refIt != refCounter.end());
787         return refIt->second;
788     }
789
790     // Reuse data allocated in <host> inside the <user> blob.
791     void reuse(const LayerPin& host, const LayerPin& user)
792     {
793         CV_Assert(reuseMap.find(user) == reuseMap.end());
794         CV_Assert(reuseMap.find(host) != reuseMap.end());
795         LayerPin memHost = reuseMap[host];
796         reuseMap[user] = memHost;
797         if (refCounter.find(memHost) != refCounter.end())
798         {
799             std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
800             if (userRefIt != refCounter.end())
801             {
802                 refCounter[memHost] += userRefIt->second;
803                 refCounter.erase(userRefIt);
804             }
805             else
806                 refCounter[memHost] += 1;
807         }
808     }
809
810     // Decrease the reference counter of the memory allocated for a specific blob.
811     void releaseReference(const LayerPin& lp)
812     {
813         std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
814         CV_Assert(mapIt != reuseMap.end());
815
816         std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
817         CV_Assert(refIt != refCounter.end());
818         CV_Assert(refIt->second > 0);
819         refIt->second -= 1;
820     }
821
822     void releaseReferences(const std::vector<LayerPin>& pins)
823     {
824         for (int i = 0; i < pins.size(); i++)
825         {
826             releaseReference(pins[i]);
827         }
828     }
829
830     void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
831     {
832         if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
833         {
834             Mat bestBlob;
835             LayerPin bestBlobPin;
836
837             std::map<LayerPin, Mat>::iterator hostIt;
838             std::map<LayerPin, int>::iterator refIt;
839
840             const int targetTotal = total(shape);
841             int bestBlobTotal = INT_MAX;
842
843             for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
844             {
845                 refIt = refCounter.find(hostIt->first);
846                 // Use only blobs that had references before; otherwise the blob
847                 // might be used as a network output.
848                 if (refIt != refCounter.end() && refIt->second == 0)
849                 {
850                     Mat& unusedBlob = hostIt->second;
851                     if (unusedBlob.total() >= targetTotal &&
852                         unusedBlob.total() < bestBlobTotal)
853                     {
854                         bestBlobPin = hostIt->first;
855                         bestBlob = unusedBlob;
856                         bestBlobTotal = unusedBlob.total();
857                     }
858                 }
859             }
860             if (!bestBlob.empty())
861             {
862                 reuse(bestBlobPin, lp);
863                 dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
864                 return;
865             }
866         }
867
868         {
869             // If dst has already been allocated with total(shape) elements,
870             // it won't be recreated and the dst.data pointer remains the same.
871             dst.create(shape, use_half ? CV_16S : CV_32F);
872             addHost(lp, dst);
873         }
874     }
875
876     void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
877                                std::vector<LayerPin>& pinsForInternalBlobs,
878                                bool use_half = false)
879     {
880         CV_TRACE_FUNCTION();
881
882         pinsForInternalBlobs.clear();
883
884         std::vector<Mat>& outputBlobs = ld.outputBlobs,
885                 &internalBlobs = ld.internals;
886
887         const ShapesVec& outShapes = layerShapes.out,
888                 internalShapes = layerShapes.internal;
889
890         outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
891         internalBlobs.resize(internalShapes.size());
892
893         CV_Assert(ld.requiredOutputs.size() <= outShapes.size());
894
895         // Check whether the layer can work in-place.
896         bool inPlace = false;
897         if (layerShapes.supportInPlace)
898         {
899             if (ld.inputBlobs.size() == 1)
900             {
901                 // Get number of references to the input memory.
902                 int numRef = numReferences(ld.inputBlobsId[0]);
903                 // Check whether the current layer is the one and only consumer of this blob.
904                 inPlace = numRef == 1;
905             }
906         }
907
908         ShapesVec shapes(outShapes);
909         shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
910         std::vector<Mat*> blobs;
911         for(int i = 0; i < outputBlobs.size(); i++)
912         {
913             blobs.push_back(&outputBlobs[i]);
914         }
915
916         for(int i = 0; i < internalBlobs.size(); i++)
917         {
918             blobs.push_back(&internalBlobs[i]);
919             if (total(internalShapes[i]))
920             {
921                 pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
922             }
923         }
924
925         addReferences(pinsForInternalBlobs);
926
927         std::map<int, std::vector<int> > idxSizes;
928         for(int i = 0; i < shapes.size(); i++)
929         {
930             idxSizes[total(shapes[i])].push_back(i);
931         }
932
933         std::map<int, std::vector<int> >::reverse_iterator it;
934         for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
935         {
936             for(int j = 0; j < it->second.size(); j++)
937             {
938                 int index = it->second[j];
939                 if (total(shapes[index]))
940                 {
941                     LayerPin blobPin(ld.id, index);
942                     if (index < outShapes.size() && inPlace)
943                     {
944                         CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
945                         ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
946                         reuse(ld.inputBlobsId[0], blobPin);
947                     }
948                     else
949                         reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
950                 }
951             }
952         }
953     }
954
955     // Clear internal state. Called before every reallocation.
956     void reset()
957     {
958         CV_TRACE_FUNCTION();
959
960         refCounter.clear();
961         reuseMap.clear();
962         memHosts.clear();
963     }
964
965 private:
966     // Register allocated memory.
967     void addHost(const LayerPin& lp, const Mat& mat)
968     {
969         CV_Assert(memHosts.find(lp) == memHosts.end());
970         reuseMap[lp] = lp;
971         memHosts[lp] = mat;
972     }
973
974     std::map<LayerPin, int> refCounter;
975     // Maps a pin to its origin blob (the one for which memory was allocated first).
976     // For origin blobs, key == value.
977     std::map<LayerPin, LayerPin> reuseMap;
978     std::map<LayerPin, Mat> memHosts;
979 };
980
981 static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
982 {
983     if (backendId == DNN_BACKEND_OPENCV)
984     {
985         if (targetId == DNN_TARGET_CPU)
986             return Ptr<BackendWrapper>();
987         else if (IS_DNN_OPENCL_TARGET(targetId))
988             return OpenCLBackendWrapper::create(m);
989         else
990             CV_Error(Error::StsNotImplemented, "Unknown target identifier");
991     }
992     else if (backendId == DNN_BACKEND_HALIDE)
993     {
994         CV_Assert(haveHalide());
995 #ifdef HAVE_HALIDE
996         return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
997 #endif  // HAVE_HALIDE
998     }
999     else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
1000     {
1001         CV_Assert(haveInfEngine());
1002 #ifdef HAVE_INF_ENGINE
1003         return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
1004 #endif  // HAVE_INF_ENGINE
1005     }
1006     else if (backendId == DNN_BACKEND_VKCOM)
1007     {
1008         CV_Assert(haveVulkan());
1009 #ifdef HAVE_VULKAN
1010         return Ptr<BackendWrapper>(new VkComBackendWrapper(m));
1011 #endif  // HAVE_VULKAN
1012     }
1013     else
1014         CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1015     return Ptr<BackendWrapper>();
1016 }
1017
1018 struct Net::Impl
1019 {
1020     typedef std::map<int, LayerShapes> LayersShapesMap;
1021     typedef std::map<int, LayerData> MapIdToLayerData;
1022
1023     Impl()
1024     {
1025         //allocate fake net input layer
1026         netInputLayer = Ptr<DataLayer>(new DataLayer());
1027         LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
1028         inpl.id = 0;
1029         netInputLayer->name = inpl.name = "_input";
1030         inpl.type = "__NetInputLayer__";
1031         inpl.layerInstance = netInputLayer;
1032         layerNameToId.insert(std::make_pair(inpl.name, inpl.id));
1033
1034         lastLayerId = 0;
1035         netWasAllocated = false;
1036         fusion = true;
1037         isAsync = false;
1038         preferableBackend = DNN_BACKEND_DEFAULT;
1039         preferableTarget = DNN_TARGET_CPU;
1040         skipInfEngineInit = false;
1041     }
1042
1043     Ptr<DataLayer> netInputLayer;
1044     std::vector<LayerPin> blobsToKeep;
1045     MapIdToLayerData layers;
1046     std::map<String, int> layerNameToId;
1047     BlobManager blobManager;
1048     int preferableBackend;
1049     int preferableTarget;
1050     String halideConfigFile;
1051     bool skipInfEngineInit;
1052     // Map host data to backend specific wrapper.
1053     std::map<void*, Ptr<BackendWrapper> > backendWrappers;
1054
1055     int lastLayerId;
1056
1057     bool netWasAllocated;
1058     bool fusion;
1059     bool isAsync;
1060     std::vector<int64> layersTimings;
1061     Mat output_blob;
1062
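    // Wraps a host Mat into a wrapper for the preferable backend/target.
    // Wrappers are cached by the host data pointer, so blobs that share memory
    // (because of the in-place/reuse optimizations above) also share a single
    // device-side buffer.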
1063     Ptr<BackendWrapper> wrap(Mat& host)
1064     {
1065         if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
1066             return Ptr<BackendWrapper>();
1067
1068         MatShape shape(host.dims);
1069         for (int i = 0; i < host.dims; ++i)
1070             shape[i] = host.size[i];
1071
1072         void* data = host.data;
1073         if (backendWrappers.find(data) != backendWrappers.end())
1074         {
1075             Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
1076             if (preferableBackend == DNN_BACKEND_OPENCV)
1077             {
1078                 CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
1079                 return OpenCLBackendWrapper::create(baseBuffer, host);
1080             }
1081             else if (preferableBackend == DNN_BACKEND_HALIDE)
1082             {
1083                 CV_Assert(haveHalide());
1084   #ifdef HAVE_HALIDE
1085                 return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
1086   #endif  // HAVE_HALIDE
1087             }
1088             else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1089             {
1090                 return wrapMat(preferableBackend, preferableTarget, host);
1091             }
1092             else if (preferableBackend == DNN_BACKEND_VKCOM)
1093             {
1094   #ifdef HAVE_VULKAN
1095                 return Ptr<BackendWrapper>(new VkComBackendWrapper(baseBuffer, host));
1096   #endif
1097             }
1098             else
1099                 CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1100         }
1101
1102         Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
1103         backendWrappers[data] = wrapper;
1104         return wrapper;
1105     }
1106
1107 #ifdef HAVE_HALIDE
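    // Schedules and compiles all Halide nodes: a user-provided schedule from
    // halideConfigFile is applied when available, otherwise the layer's
    // automatic applyHalideScheduler() is used; compilation itself is spread
    // over up to 8 worker threads.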
1108     void compileHalide()
1109     {
1110         CV_TRACE_FUNCTION();
1111
1112         CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);
1113
1114         HalideScheduler scheduler(halideConfigFile);
1115         std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
1116         for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
1117         {
1118             LayerData &ld = it->second;
1119             Ptr<Layer> layer = ld.layerInstance;
1120             if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
1121             {
1122                 CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
1123                 bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
1124                 if (!scheduled)
1125                 {
1126                     // Use automatic scheduling provided by layer.
1127                     layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
1128                                                 ld.inputBlobs, ld.outputBlobs,
1129                                                 preferableTarget);
1130                 }
1131                 compileList.emplace_back(ld);
1132             }
1133         }
1134         std::atomic<int> progress(0);
1135         auto fn = ([&] () -> void
1136         {
1137             for (;;)
1138             {
1139                 int id = progress.fetch_add(1);
1140                 if ((size_t)id >= compileList.size())
1141                     return;
1142                 const LayerData& ld = compileList[id].get();
1143                 Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
1144                 dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
1145             }
1146         });
1147         size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
1148         num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
1149         std::vector<std::thread> threads(num_threads - 1);
1150         for (auto& t: threads) t = std::thread(fn);
1151         fn(); // process own tasks
1152         for (auto& t: threads) t.join();
1153     }
1154 #endif
1155
1156     void clear()
1157     {
1158         CV_TRACE_FUNCTION();
1159
1160         MapIdToLayerData::iterator it;
1161         for (it = layers.begin(); it != layers.end(); it++)
1162         {
1163             if (it->second.id != 0) {
1164                 it->second.inputBlobs.clear();
1165                 it->second.outputBlobs.clear();
1166                 it->second.internals.clear();
1167             }
1168             it->second.skip = false;
1169             //it->second.consumers.clear();
1170             Ptr<Layer> currLayer = it->second.layerInstance;
1171
1172             if( currLayer.empty() )
1173                 continue;
1174
1175             currLayer->unsetAttached();
1176         }
1177
1178         layersTimings.clear();
1179     }
1180
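    // Validates the preferable backend/target combination (falling back to CPU
    // or fp32 OpenCL when the requested device is unsupported), then clears the
    // previous state, allocates layer blobs, initializes the backend and, for
    // Halide, compiles the kernels. The work is redone only when the net has
    // not been allocated yet or the set of blobs to keep has changed.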
1181     void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
1182     {
1183         CV_TRACE_FUNCTION();
1184
1185         if (preferableBackend == DNN_BACKEND_DEFAULT)
1186             preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;
1187
1188         CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
1189                   preferableTarget == DNN_TARGET_CPU ||
1190                   preferableTarget == DNN_TARGET_OPENCL ||
1191                   preferableTarget == DNN_TARGET_OPENCL_FP16);
1192         CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
1193                   preferableTarget == DNN_TARGET_CPU ||
1194                   preferableTarget == DNN_TARGET_OPENCL);
1195         CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
1196                   preferableTarget == DNN_TARGET_CPU ||
1197                   preferableTarget == DNN_TARGET_OPENCL ||
1198                   preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1199                   preferableTarget == DNN_TARGET_MYRIAD ||
1200                   preferableTarget == DNN_TARGET_FPGA);
1201         CV_Assert(preferableBackend != DNN_BACKEND_VKCOM ||
1202                   preferableTarget == DNN_TARGET_VULKAN);
1203         if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
1204         {
1205             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
1206 #ifndef HAVE_OPENCL
1207             {
1208                 CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
1209                 preferableTarget = DNN_TARGET_CPU;
1210             }
1211 #else
1212             {
1213                 if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
1214                 {
1215                     // Current implementation is only valid for GPU (#11494)
1216                     if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
1217                     {
1218                         CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
1219                         preferableTarget = DNN_TARGET_CPU;
1220                     }
1221                     else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
1222                     {
1223                         CV_LOG_WARNING(NULL,
1224                             "DNN: OpenCL target with fp16 precision is not supported "
1225                             "with current OpenCL device (tested with Intel GPUs only), "
1226                             "switching to OpenCL with fp32 precision.");
1227                         preferableTarget = DNN_TARGET_OPENCL;
1228                     }
1229                 }
1230             }
1231 #endif
1232             if (preferableBackend == DNN_BACKEND_VKCOM && !haveVulkan())
1233             {
1234                 preferableBackend = DNN_BACKEND_OPENCV;
1235                 preferableTarget = DNN_TARGET_CPU;
1236             }
1237
1238             clear();
1239
1240             allocateLayers(blobsToKeep_);
1241
1242             MapIdToLayerData::iterator it = layers.find(0);
1243             CV_Assert(it != layers.end());
1244             it->second.skip = netInputLayer->skip;
1245
1246             initBackend();
1247
1248             if (!netWasAllocated )
1249             {
1250 #ifdef HAVE_HALIDE
1251                 if (preferableBackend == DNN_BACKEND_HALIDE)
1252                     compileHalide();
1253 #else
1254                 CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
1255 #endif
1256             }
1257
1258             netWasAllocated = true;
1259             this->blobsToKeep = blobsToKeep_;
1260         }
1261     }
1262
1263     int getLayerId(const String &layerName)
1264     {
1265         std::map<String, int>::iterator it = layerNameToId.find(layerName);
1266         return (it != layerNameToId.end()) ? it->second : -1;
1267     }
1268
1269     int getLayerId(int id)
1270     {
1271         MapIdToLayerData::iterator it = layers.find(id);
1272         return (it != layers.end()) ? id : -1;
1273     }
1274
1275     int getLayerId(DictValue &layerDesc)
1276     {
1277         if (layerDesc.isInt())
1278             return getLayerId(layerDesc.get<int>());
1279         else if (layerDesc.isString())
1280             return getLayerId(layerDesc.get<String>());
1281
1282         CV_Assert(layerDesc.isInt() || layerDesc.isString());
1283         return -1;
1284     }
1285
1286     String getLayerName(int id)
1287     {
1288         MapIdToLayerData::iterator it = layers.find(id);
1289         return (it != layers.end()) ? it->second.name : "(unknown layer)";
1290     }
1291
1292     LayerData& getLayerData(int id)
1293     {
1294         MapIdToLayerData::iterator it = layers.find(id);
1295
1296         if (it == layers.end())
1297             CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));
1298
1299         return it->second;
1300     }
1301
1302     LayerData& getLayerData(const String &layerName)
1303     {
1304         int id = getLayerId(layerName);
1305
1306         if (id < 0)
1307             CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");
1308
1309         return getLayerData(id);
1310     }
1311
1312     LayerData& getLayerData(const DictValue &layerDesc)
1313     {
1314         CV_Assert(layerDesc.isInt() || layerDesc.isString());
1315         if (layerDesc.isInt())
1316             return getLayerData(layerDesc.get<int>());
1317         else /*if (layerDesc.isString())*/
1318             return getLayerData(layerDesc.get<String>());
1319     }
1320
1321     static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
1322     {
1323         if ((int)ld.inputBlobsId.size() <= inNum)
1324         {
1325             ld.inputBlobsId.resize(inNum + 1);
1326         }
1327         else
1328         {
1329             LayerPin storedFrom = ld.inputBlobsId[inNum];
1330             if (storedFrom.valid() && !storedFrom.equal(from))
1331                 CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
1332                                                  inNum, ld.name.c_str()));
1333         }
1334
1335         ld.inputBlobsId[inNum] = from;
1336     }
1337
1338     int resolvePinOutputName(LayerData &ld, const String &outName)
1339     {
1340         if (outName.empty())
1341             return 0;
1342         return ld.getLayerInstance()->outputNameToIndex(outName);
1343     }
1344
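    // Resolves a layer name to a LayerPin; an empty name refers to the network
    // input layer (id 0), and the output index is obtained from the layer's
    // outputNameToIndex().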
1345     LayerPin getPinByAlias(const String &layerName)
1346     {
1347         LayerPin pin;
1348         pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1349
1350         if (pin.lid >= 0)
1351             pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);
1352
1353         return pin;
1354     }
1355
1356     std::vector<LayerPin> getLayerOutPins(const String &layerName)
1357     {
1358         int lid = (layerName.empty()) ? 0 : getLayerId(layerName);
1359
1360         std::vector<LayerPin> pins;
1361
1362         for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
1363         {
1364             pins.push_back(LayerPin(lid, i));
1365         }
1366
1367         return pins;
1368     }
1369
1370     void connect(int outLayerId, int outNum, int inLayerId, int inNum)
1371     {
1372         CV_Assert(outLayerId < inLayerId);
1373         LayerData &ldOut = getLayerData(outLayerId);
1374         LayerData &ldInp = getLayerData(inLayerId);
1375
1376         addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
1377         ldOut.requiredOutputs.insert(outNum);
1378         ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
1379     }
1380
1381     void initBackend()
1382     {
1383         CV_TRACE_FUNCTION();
1384         if (preferableBackend == DNN_BACKEND_OPENCV)
1385             CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
1386         else if (preferableBackend == DNN_BACKEND_HALIDE)
1387             initHalideBackend();
1388         else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
1389             initInfEngineBackend();
1390         else if (preferableBackend == DNN_BACKEND_VKCOM)
1391             initVkComBackend();
1392         else
1393             CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
1394     }
1395
1396     void initHalideBackend()
1397     {
1398         CV_TRACE_FUNCTION();
1399         CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());
1400
1401         // Iterator to current layer.
1402         MapIdToLayerData::iterator it = layers.begin();
1403         // Iterator to the base layer for fusion. For example, in the case of conv+bn+relu
1404         // it will be the conv layer.
1405         MapIdToLayerData::iterator baseIt = layers.begin();
1406         for (; it != layers.end(); it++)
1407         {
1408             LayerData &ldTop = it->second;
1409             Ptr<Layer> layerTop = ldTop.layerInstance;
1410             if (!layerTop->supportBackend(preferableBackend))
1411             {
1412                 // Move the base iterator to the layer that doesn't support the preferable
1413                 // backend, to prevent fusion across layers of different backends.
1414                 baseIt = it;
1415                 continue;
1416             }
1417             // Try to do layers fusion.
1418             LayerData &ldBot = baseIt->second;
1419             Ptr<Layer> layerBot = ldBot.layerInstance;
1420             // 1. Check that bottom and top are from the same backend.
1421             if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
1422             {
1423                 // 2. Check that current layer works in-place.
1424                 bool inPlace = ldTop.inputBlobs.size() == 1 &&
1425                                ldBot.outputBlobs.size() == 1 &&
1426                                ldTop.inputBlobs[0]->data ==
1427                                ldBot.outputBlobs[0].data;
1428                 if (inPlace)
1429                 {
1430                     // 3. Try to attach node.
1431                     CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
1432                     Ptr<BackendNode> fusedNode =
1433                         layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
1434                     if (!fusedNode.empty())
1435                     {
1436                         ldTop.skip = true;
1437                         ldBot.backendNodes[preferableBackend] = fusedNode;
1438                         ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
1439                         continue;
1440                     }
1441                 }
1442             }
1443             // No layer fusion.
1444             ldTop.skip = false;
1445             ldTop.backendNodes[DNN_BACKEND_HALIDE] =
1446                 layerTop->initHalide(ldTop.inputBlobsWrappers);
1447             baseIt = it;
1448         }
1449     }
1450
1451 #ifdef HAVE_INF_ENGINE
1452     // Before launching the Inference Engine graph we need to specify the output blobs.
1453     // This function requests output blobs based on input references of
1454     // layers from the default backend or layers from different graphs.
1455     void addInfEngineNetOutputs(LayerData &ld)
1456     {
1457         Ptr<InfEngineBackendNet> layerNet;
1458         if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
1459         {
1460             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1461             if (!node.empty())
1462             {
1463                 Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1464                 CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
1465                 layerNet = ieNode->net;
1466             }
1467         }
1468         // For every input reference we check whether it belongs to one of
1469         // the Inference Engine backend graphs, and request an output blob if it does.
1470         // Do nothing if the layer's input is from the same graph.
1471         for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1472         {
1473             LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1474             Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1475             if (!inpNode.empty())
1476             {
1477                 Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1478                 CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1479                 if (layerNet != ieInpNode->net)
1480                 {
1481                     // layerNet is empty or nodes are from different graphs.
1482                     ieInpNode->net->addOutput(ieInpNode->layer.getName());
1483                 }
1484             }
1485         }
1486     }
1487 #endif  // HAVE_INF_ENGINE
1488
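     // Creates Vulkan compute (VkCom) backend nodes for every layer that supports
     // this backend; if node creation throws, the layer keeps an empty node and
     // falls back to the CPU implementation (see the catch block below).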
1489     void initVkComBackend()
1490     {
1491         CV_TRACE_FUNCTION();
1492         CV_Assert(preferableBackend == DNN_BACKEND_VKCOM);
1493 #ifdef HAVE_VULKAN
1494         if (!haveVulkan())
1495             return;
1496
1497         MapIdToLayerData::iterator it = layers.begin();
1498         for (; it != layers.end(); it++)
1499         {
1500             LayerData &ld = it->second;
1501             Ptr<Layer> layer = ld.layerInstance;
1502             if (!layer->supportBackend(preferableBackend))
1503             {
1504                 continue;
1505             }
1506
1507             ld.skip = false;
1508
1509             try
1510             {
1511                 ld.backendNodes[DNN_BACKEND_VKCOM] =
1512                     layer->initVkCom(ld.inputBlobsWrappers);
1513             }
1514             catch (const cv::Exception& e)
1515             {
1516                 CV_LOG_ERROR(NULL, "initVkCom failed, fallback to CPU implementation. " << e.what());
1517                 ld.backendNodes[DNN_BACKEND_VKCOM] = Ptr<BackendNode>();
1518             }
1519         }
1520 #endif
1521     }
1522
1523     void initInfEngineBackend()
1524     {
1525         CV_TRACE_FUNCTION();
1526         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1527 #ifdef HAVE_INF_ENGINE
1528         MapIdToLayerData::iterator it;
1529         Ptr<InfEngineBackendNet> net;
1530
1531         for (it = layers.begin(); it != layers.end(); ++it)
1532         {
1533             LayerData &ld = it->second;
1534             if (ld.id == 0)
1535             {
1536                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1537                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1538                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1539                 {
1540                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1541 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1542                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1543 #else
1544                     dataPtr->setName(netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i]);
1545 #endif
1546                 }
1547             }
1548             else
1549             {
1550                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1551                 {
1552                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1553 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1554                     dataPtr->name = ld.name;
1555 #else
1556                     dataPtr->setName(ld.name);
1557 #endif
1558                 }
1559             }
1560         }
1561
1562         if (skipInfEngineInit)
1563         {
1564             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1565             CV_Assert(!node.empty());
1566
1567             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1568             CV_Assert(!ieNode.empty());
1569
1570             for (it = layers.begin(); it != layers.end(); ++it)
1571             {
1572                 LayerData &ld = it->second;
1573                 if (ld.id == 0)
1574                 {
1575                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1576                     {
1577                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1578 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1579                         dataPtr->name = netInputLayer->outNames[i];
1580 #else
1581                         dataPtr->setName(netInputLayer->outNames[i]);
1582 #endif
1583                     }
1584                 }
1585                 else
1586                 {
1587                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1588                     {
1589                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1590 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2019010000)
1591                         dataPtr->name = ld.name;
1592 #else
1593                         dataPtr->setName(ld.name);
1594 #endif
1595                     }
1596                 }
1597                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1598                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1599                 ld.skip = true;
1600             }
1601             layers[lastLayerId].skip = false;
1602             ieNode->net->init(preferableTarget);
1603             return;
1604         }
1605
1606         // Build Inference Engine networks from sets of layers that support this
1607         // backend. Split the whole model into several Inference Engine networks if
1608         // some of the layers are not implemented.
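             // An illustrative split (layer names are placeholders): for a graph
             //     [conv1] -> [custom_layer] -> [conv2]
             // where custom_layer has no Inference Engine implementation, two networks
             // are built, one around conv1 and one around conv2, and custom_layer is
             // executed by the default OpenCV backend in between.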
1609
1610         // Set of all input and output blob wrappers for the current network.
1611         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1612         for (it = layers.begin(); it != layers.end(); ++it)
1613         {
1614             LayerData &ld = it->second;
1615             if (ld.id == 0 && ld.skip)
1616                 continue;
1617             bool fused = ld.skip;
1618
1619             Ptr<Layer> layer = ld.layerInstance;
1620             if (!fused && !layer->supportBackend(preferableBackend))
1621             {
1622                 bool customizable = ld.id != 0 && ld.outputBlobs.size() == 1 &&
1623                                     INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R2);
1624                 // TODO: there is a bug in the Myriad plugin with shape inference for custom layers.
1625                 if (preferableTarget == DNN_TARGET_MYRIAD)
1626                 {
1627                     for (int i = 0; customizable && i < ld.inputBlobs.size(); ++i)
1628                     {
1629                         customizable = ld.inputBlobs[i]->size[0] == 1;
1630                     }
1631                 }
1632
1633                 // TODO: fix these workarounds
1634                 if (preferableTarget == DNN_TARGET_MYRIAD ||
1635                     preferableTarget == DNN_TARGET_OPENCL ||
1636                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1637                     customizable &= ld.type != "Concat";
1638
1639                 if (preferableTarget == DNN_TARGET_OPENCL ||
1640                     preferableTarget == DNN_TARGET_OPENCL_FP16)
1641                     customizable &= ld.type != "Power";
1642
1643                 if (preferableTarget == DNN_TARGET_OPENCL)
1644                     customizable &= ld.type != "Eltwise";
1645
1646                 if (!customizable)
1647                 {
1648                     addInfEngineNetOutputs(ld);
1649                     net = Ptr<InfEngineBackendNet>();
1650                     netBlobsWrappers.clear();  // Not used since the R5 release, but we don't wrap it in an #ifdef.
1651                     layer->preferableTarget = DNN_TARGET_CPU;
1652                     continue;
1653                 }
1654             }
1655             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1656
1657             // Create a new network if one of the inputs comes from a different Inference Engine graph.
1658             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1659             {
1660                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1661                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1662                 if (!inpNode.empty())
1663                 {
1664                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1665                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1666                     if (ieInpNode->net != net)
1667                     {
1668                         net = Ptr<InfEngineBackendNet>();
1669                         netBlobsWrappers.clear();  // Not used since the R5 release, but we don't wrap it in an #ifdef.
1670                         break;
1671                     }
1672                 }
1673             }
1674
1675             Ptr<BackendNode> node;
1676             if (!net.empty())
1677             {
1678                 if (fused)
1679                 {
1680                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1681                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1682                     CV_Assert(inPlace);
1683                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1684                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1685                 }
1686             }
1687             else
1688                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1689
1690             if (!fused)
1691             {
1692                 if (layer->supportBackend(preferableBackend))
1693                     node = layer->initInfEngine(ld.inputBlobsWrappers);
1694                 else
1695                 {
1696                     node = Ptr<BackendNode>(new InfEngineBackendNode(
1697                         ld.layerInstance, ld.inputBlobs, ld.outputBlobs, ld.internals));
1698                 }
1699             }
1700             else if (node.empty())
1701                 continue;
1702
1703             CV_Assert(!node.empty());
1704             ld.backendNodes[preferableBackend] = node;
1705
1706             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1707             CV_Assert(!ieNode.empty());
1708             ieNode->net = net;
1709
1710             // Convert weights to FP16 for specific targets.
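                 // For the OpenCL FP16, Myriad and FPGA targets the weights are stored in FP16.
                 // With Inference Engine releases older than 2019R1 a non-weightable layer gets
                 // a dummy FP16 blob so that its precision can still be deduced (see the #else
                 // branch below).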
1711             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
1712                  preferableTarget == DNN_TARGET_MYRIAD ||
1713                  preferableTarget == DNN_TARGET_FPGA) && !fused)
1714             {
1715 #if INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2019R1)
1716                 for (const std::string& name : {"weights", "biases"})
1717                 {
1718                     auto it = ieNode->layer.getParameters().find(name);
1719                     if (it != ieNode->layer.getParameters().end())
1720                     {
1721                         InferenceEngine::Blob::Ptr bp = it->second.as<InferenceEngine::Blob::Ptr>();
1722                         it->second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(bp));
1723                     }
1724                 }
1725 #else
1726                 auto& blobs = ieNode->layer.getConstantData();
1727                 if (blobs.empty())
1728                 {
1729                     // In the case of a non-weightable layer we have to specify
1730                     // its precision by adding a dummy blob.
1731                     auto blob = InferenceEngine::make_shared_blob<int16_t>(
1732                                     InferenceEngine::Precision::FP16,
1733                                     InferenceEngine::Layout::C, {1});
1734                     blob->allocate();
1735                     blobs[""] = blob;
1736                 }
1737                 else
1738                 {
1739                     for (auto& it : blobs)
1740                         it.second = convertFp16(std::const_pointer_cast<InferenceEngine::Blob>(it.second));
1741                 }
1742 #endif
1743             }
1744
1745             if (!fused)
1746                 net->addLayer(ieNode->layer);
1747
1748             net->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers, ieNode->layer.getName());
1749             net->addBlobs(ld.inputBlobsWrappers);
1750             net->addBlobs(ld.outputBlobsWrappers);
1751             addInfEngineNetOutputs(ld);
1752         }
1753
1754         // Initialize all networks.
1755         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1756         {
1757             LayerData &ld = it->second;
1758             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1759                 continue;
1760
1761             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1762             if (node.empty())
1763                 continue;
1764
1765             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1766             if (ieNode.empty())
1767                 continue;
1768
1769             CV_Assert(!ieNode->net.empty());
1770
1771             if (!ieNode->net->isInitialized())
1772             {
1773                 ieNode->net->init(preferableTarget);
1774                 ld.skip = false;
1775             }
1776         }
1777 #endif  // HAVE_INF_ENGINE
1778     }
1779
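     // Allocates blobs for the layer with the given id: parents are allocated
     // first (recursively), inputs are bound to the parents' outputs, output and
     // internal blobs are created through the blob manager and wrapped for the
     // selected backend, and finally Layer::finalize() is called.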
1780     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1781     {
1782         CV_TRACE_FUNCTION();
1783
1784         LayerData &ld = layers[lid];
1785
1786         //already allocated
1787         if (ld.flag)
1788             return;
1789
1790         size_t ninputs = ld.inputBlobsId.size();
1791 #if 0
1792         printf("layer %s:", ld.name.c_str());
1793         for (size_t i = 0; i < ninputs; i++)
1794         {
1795             int inp_lid = ld.inputBlobsId[i].lid;
1796             LayerData &inp_ld = layers[inp_lid];
1797             int inp_outputs = (int)inp_ld.outputBlobs.size();
1798             std::cout << " " << inp_ld.name << "(" << inp_outputs;
1799
1800             for( int j = 0; j < inp_outputs; j++ )
1801             {
1802                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1803             }
1804             std::cout << ")";
1805         }
1806         printf("\n");
1807 #endif
1808
1809         //determine parent layers
1810         for (size_t i = 0; i < ninputs; i++)
1811             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1812
1813         //allocate parents
1814         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1815             allocateLayer(*i, layersShapes);
1816
1817         //bind inputs
1818         if (ld.id == 0)  // DataLayer
1819         {
1820             ninputs = netInputLayer->inputsData.size();
1821             ld.inputBlobsWrappers.resize(ninputs);
1822             for (size_t i = 0; i < ninputs; i++)
1823             {
1824                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1825             }
1826         }
1827         else
1828         {
1829             ld.inputBlobs.resize(ninputs);
1830             ld.inputBlobsWrappers.resize(ninputs);
1831             for (size_t i = 0; i < ninputs; i++)
1832             {
1833                 LayerPin from = ld.inputBlobsId[i];
1834                 CV_Assert(from.valid());
1835                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1836                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1837                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1838             }
1839         }
1840
1841         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1842
1843         CV_Assert(layerShapesIt != layersShapes.end());
1844
1845         std::vector<LayerPin> pinsForInternalBlobs;
1846         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1847                                           preferableBackend == DNN_BACKEND_OPENCV &&
1848                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
1849         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1850         for (int i = 0; i < ld.outputBlobs.size(); ++i)
1851         {
1852             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1853         }
1854         ld.internalBlobsWrappers.resize(ld.internals.size());
1855         for (int i = 0; i < ld.internals.size(); ++i)
1856         {
1857             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1858         }
1859
1860         Ptr<Layer> layerPtr = ld.getLayerInstance();
1861         {
1862             std::vector<Mat> inps(ld.inputBlobs.size());
1863             for (int i = 0; i < ld.inputBlobs.size(); ++i)
1864             {
1865                 inps[i] = *ld.inputBlobs[i];
1866             }
1867             layerPtr->finalize(inps, ld.outputBlobs);
1868             layerPtr->preferableTarget = preferableTarget;
1869 #if 0
1870             std::cout << "\toutputs:";
1871             size_t noutputs = ld.outputBlobs.size();
1872             for (size_t j = 0; j < noutputs; j++)
1873             {
1874                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
1875             }
1876             std::cout << "\n";
1877 #endif
1878         }
1879
1880         // After the layer is allocated, we decrease the reference counters of its input blobs.
1881         blobManager.releaseReferences(ld.inputBlobsId);
1882         blobManager.releaseReferences(pinsForInternalBlobs);
1883
1884         ld.flag = 1;
1885     }
1886
1887 #if 0
1888 #define printf_(args) printf args
1889 #else
1890 #define printf_(args)
1891 #endif
1892
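     // Fuses chains of layers to reduce the number of executed nodes. Runs only
     // when fusion is enabled and the backend is OpenCV or Inference Engine; the
     // OpenCL-specific fusions below additionally require the OpenCV backend.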
1893     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
1894     {
1895         if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
1896                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
1897             return;
1898
1899         CV_TRACE_FUNCTION();
1900
1901         // Scan through all the layers. If there is a convolution layer followed by an activation layer,
1902         // we try to embed the activation into the convolution and disable separate execution of the activation.
1903         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
1904                                       blobsToKeep_.end());
1905         MapIdToLayerData::iterator it;
1906         for (it = layers.begin(); it != layers.end(); it++)
1907         {
1908             int lid = it->first;
1909             LayerData& ld = layers[lid];
1910             if( ld.skip )
1911             {
1912                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1913                 continue;
1914             }
1915             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1916
1917             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
1918             // with the current layer if they follow it. Normally they are fused with a convolution layer,
1919             // but some of them (like activations) may be fused with fully-connected, element-wise (+) and
1920             // some other layers.
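                 // An illustrative chain (assuming every step supports fusion):
                 //     Convolution -> BatchNorm -> Scale -> ReLU
                 // After fusion only the Convolution layer stays enabled; the other three are
                 // marked with skip = true and share output blobs with the convolution.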
1921             Ptr<Layer>& currLayer = ld.layerInstance;
1922             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
1923             {
1924                 LayerData* nextData = &layers[ld.consumers[0].lid];
1925                 LayerPin lpNext(ld.consumers[0].lid, 0);
1926                 while (nextData)
1927                 {
1928                     Ptr<Layer> nextLayer = nextData->layerInstance;
1929                     if (currLayer->tryFuse(nextLayer))
1930                     {
1931                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
1932                         nextData->skip = true;
1933                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1934                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1935                         if (nextData->consumers.size() == 1)
1936                         {
1937                             int nextLayerId = nextData->consumers[0].lid;
1938                             nextData = &layers[nextLayerId];
1939                             lpNext = LayerPin(nextLayerId, 0);
1940                         }
1941                         else
1942                         {
1943                             nextData = 0;
1944                             break;
1945                         }
1946                     }
1947                     else
1948                         break;
1949                 }
1950
1951                 if (preferableBackend != DNN_BACKEND_OPENCV)
1952                     continue;  // Go to the next layer.
1953
1954                 // TODO: support more fusion styles on the OpenCL target.
1955                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
1956                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
1957                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
1958                      ld.layerInstance->type != "Concat")) )
1959                     continue;
1960
1961                 while (nextData)
1962                 {
1963                     // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations
1964                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
1965                         nextData->type != "ReLU" &&
1966                         nextData->type != "ChannelsPReLU" &&
1967                         nextData->type != "ReLU6" &&
1968                         nextData->type != "TanH" &&
1969                         nextData->type != "Power")
1970                         break;
1971
1972                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1973                     if (nextActivLayer.empty())
1974                         break;
1975
1976                     if (currLayer->setActivation(nextActivLayer))
1977                     {
1978                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1979                         nextData->skip = true;
1980                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1981                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1982                         if (nextData->consumers.size() == 1)
1983                         {
1984                             int nextLayerId = nextData->consumers[0].lid;
1985                             nextData = &layers[nextLayerId];
1986                             lpNext = LayerPin(nextLayerId, 0);
1987                         }
1988                         else
1989                         {
1990                             nextData = 0;
1991                             break;
1992                         }
1993                     }
1994                     else
1995                         break;
1996                 }
1997
1998                 // fuse convolution layer followed by eltwise + relu
1999                 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
2000                 {
2001                     Ptr<EltwiseLayer> nextEltwiseLayer;
2002                     if( nextData )
2003                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
2004
2005                     if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2006                         nextData && nextData->inputBlobsId.size() == 2 )
2007                     {
2008                         LayerData *eltwiseData = nextData;
2009
2010                         // The Eltwise layer has two inputs. We need to determine which
2011                         // is the base convolution layer and which could be used as its bias.
2012                         LayerData* biasLayerData = 0;
2013                         for (int i = 0; i < 2; ++i)
2014                         {
2015                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
2016                             CV_Assert(downLayerData);
2017                             while (downLayerData->skip)
2018                             {
2019                                 if (downLayerData->inputBlobsId.size() == 1)
2020                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
2021                                 else
2022                                 {
2023                                     downLayerData = 0;
2024                                     break;
2025                                 }
2026                             }
2027                             if (downLayerData && ld.id == downLayerData->id)
2028                             {
2029                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
2030                                 break;
2031                             }
2032                         }
2033                         CV_Assert(biasLayerData);
2034                         {
2035                             if( eltwiseData->consumers.size() == 1 )
2036                             {
2037                                 // fuse eltwise + activation layer
2038                                 if (biasLayerData->id < ld.id)
2039                                 {
2040                                     nextData = &layers[eltwiseData->consumers[0].lid];
2041                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
2042                                     Ptr<ActivationLayer> nextActivLayer;
2043                                     if( nextData )
2044                                         nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
2045
2046                                     if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
2047                                             (!nextData->type.compare("ReLU") ||
2048                                              !nextData->type.compare("ChannelsPReLU") ||
2049                                              !nextData->type.compare("Power")) &&
2050                                             currLayer->setActivation(nextActivLayer) )
2051                                     {
2052                                         CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
2053                                         ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
2054                                         printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
2055                                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
2056                                         eltwiseData->skip = true;
2057                                         nextData->skip = true;
2058                                         // This optimization is for cases like
2059                                         // some_layer   conv
2060                                         //   |             |
2061                                         //   +-- eltwise --+
2062                                         //          |
2063                                         //        activ
2064                                         // This way all the element-wise computations
2065                                         // (i.e. some_layer+conv or some_layer*conv)
2066                                         // would be done at [conv] layer. So we need to
2067                                         // replace [conv]'s output blob with [eltwise]'s one
2068                                         // considering that [activ] is an in-place layer.
2069                                         // Also we need to move all the consumers' references.
2070                                         // To prevent memory collisions (i.e. when input of
2071                                         // [conv] and output of [eltwise] is the same blob)
2072                                         // we allocate a new blob.
2073                                         CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
2074                                         ld.outputBlobs[0] = ld.outputBlobs[0].clone();
2075                                         ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
2076
2077                                         eltwiseData->outputBlobs = ld.outputBlobs;
2078                                         nextData->outputBlobs = ld.outputBlobs;
2079                                         eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
2080                                         nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
2081
2082                                         // Move references of [activ] layer consumers to the newly allocated blob.
2083                                         for (int i = 0; i < nextData->consumers.size(); ++i)
2084                                         {
2085                                             LayerData& consumer = layers[nextData->consumers[i].lid];
2086                                             for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
2087                                             {
2088                                                 if (consumer.inputBlobsId[j].lid == lpNext.lid)
2089                                                 {
2090                                                     consumer.inputBlobs[j] = &ld.outputBlobs[0];
2091                                                     consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
2092                                                     break;
2093                                                 }
2094                                             }
2095                                         }
2096                                     }
2097                                 }
2098                             }
2099                         }
2100                     }
2101                 }
2102             }
2103
2104             if (preferableBackend != DNN_BACKEND_OPENCV)
2105                 continue;  // Go to the next layer.
2106
2107             // Optimization #2: if there is a concat layer that concatenates channels
2108             // from the inputs together (i.e. axis == 1), then we make the inputs of
2109             // the concat layer write directly to the concatenation output buffer
2110             // (and so we eliminate the concatenation layer, because the channels
2111             // are concatenated implicitly).
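                 // An illustrative example (channel counts are arbitrary): for a Concat of
                 // three inputs with 16, 32 and 16 channels, the producing layers are
                 // redirected to write into channel ranges [0,16), [16,48) and [48,64) of
                 // the Concat output blob, and the Concat layer itself is skipped.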
2112             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
2113             if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
2114                 ld.outputBlobs.size() == 1 )
2115             {
2116                 Mat& output = ld.outputBlobs[0];
2117                 UMat umat_output;
2118                 if (!ld.outputBlobsWrappers.empty() &&
2119                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
2120                 {
2121                     size_t i, ninputs = ld.inputBlobsId.size();
2122                     bool conv_layer = true;
2123                     for( i = 0; i < ninputs; i++ )
2124                     {
2125                         LayerPin pin = ld.inputBlobsId[i];
2126                         LayerData* inp_i_data = &layers[pin.lid];
2127                         while(inp_i_data->skip &&
2128                               inp_i_data->inputBlobsId.size() == 1 &&
2129                               inp_i_data->consumers.size() == 1)
2130                         {
2131                             pin = inp_i_data->inputBlobsId[0];
2132                             inp_i_data = &layers[pin.lid];
2133                         }
2134                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
2135                     }
2136                     if (!conv_layer)
2137                         continue;
2138                     std::vector<UMat> umat_outputBlobs;
2139                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2140                     umat_output = umat_outputBlobs[0];
2141                 }
2142
2143                 // TODO: in general, this optimization can always be done, but
2144                 // many layers currently check that the input/output blobs are
2145                 // continuous arrays. Unfortunately, this is not true when
2146                 // the concatenation optimization is applied with batch_size > 1.
2147                 // So, for now, we only apply this optimization in the most popular
2148                 // case, batch_size == 1.
2149                 if( output.dims == 4 && output.size[0] == 1 )
2150                 {
2151                     size_t i, ninputs = ld.inputBlobsId.size();
2152                     std::vector<LayerPin> realinputs(ninputs);
2153                     for( i = 0; i < ninputs; i++ )
2154                     {
2155                         LayerPin pin = ld.inputBlobsId[i];
2156                         LayerData* inp_i_data = &layers[pin.lid];
2157                         while(inp_i_data->skip &&
2158                               inp_i_data->inputBlobsId.size() == 1 &&
2159                               inp_i_data->consumers.size() == 1)
2160                         {
2161                             pin = inp_i_data->inputBlobsId[0];
2162                             inp_i_data = &layers[pin.lid];
2163                         }
2164                         printf_(("\treal input for %s is %s\n",
2165                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
2166                                inp_i_data->getLayerInstance()->name.c_str()));
2167
2168                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
2169                             break;
2170                         realinputs[i] = pin;
2171                     }
2172
2173                     if( i >= ninputs )
2174                     {
2175                         // Allocate new memory to prevent collisions during memory
2176                         // reuse (see https://github.com/opencv/opencv/pull/10456).
2177                         output = output.clone();
2178                         if (preferableBackend == DNN_BACKEND_OPENCV &&
2179                             IS_DNN_OPENCL_TARGET(preferableTarget))
2180                         {
2181                             std::vector<UMat> umats(1);
2182                             umat_output = umat_output.clone();
2183                             umats[0] = umat_output;
2184                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2185                         }
2186                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2187                         int ofs = 0;
2188                         for( i = 0; i < ninputs; i++ )
2189                         {
2190                             LayerPin pin = realinputs[i];
2191                             LayerData* inp_i_data = &layers[pin.lid];
2192                             int channels_i = ld.inputBlobs[i]->size[1];
2193                             chrange[1] = Range(ofs, ofs + channels_i);
2194                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2195                                    pin.oid, ofs, ofs + channels_i));
2196                             ofs += channels_i;
2197                             Mat output_slice = output(chrange);
2198                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2199                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2200                             Mat* oldPtr = &curr_output;
2201                             curr_output = output_slice;
2202                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2203                             {
2204                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2205                                 umats[pin.oid] = umat_output(chrange);
2206                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2207                             }
2208                             // Layers that referred to the old input Mat will now refer to the
2209                             // new data through the same Mat object.
2210                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2211                         }
2212                         ld.skip = true;
2213                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2214                     }
2215                 }
2216             }
2217         }
2218     }
2219
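     // Allocates blobs for every layer: reference counts are set up so the blob
     // manager can reuse memory, the blobs listed in blobsToKeep_ are pinned, each
     // layer is allocated in topological order, and fuseLayers() runs at the end.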
2220     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2221     {
2222         CV_TRACE_FUNCTION();
2223
2224         MapIdToLayerData::iterator it;
2225         for (it = layers.begin(); it != layers.end(); it++)
2226             it->second.flag = 0;
2227
2228         CV_Assert(!layers[0].outputBlobs.empty());
2229         ShapesVec inputShapes;
2230         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2231         {
2232             Mat& inp = layers[0].outputBlobs[i];
2233             CV_Assert(inp.total());
2234             if (preferableBackend == DNN_BACKEND_OPENCV &&
2235                 preferableTarget == DNN_TARGET_OPENCL_FP16)
2236             {
2237                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);
2238             }
2239             inputShapes.push_back(shape(inp));
2240         }
2241         LayersShapesMap layersShapes;
2242         getLayersShapes(inputShapes, layersShapes);
2243
2244         blobManager.reset();
2245         backendWrappers.clear();
2246         // Fake references to input blobs.
2247         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2248             blobManager.addReference(LayerPin(0, i));
2249         for (it = layers.begin(); it != layers.end(); ++it)
2250         {
2251             const LayerData& ld = it->second;
2252             blobManager.addReferences(ld.inputBlobsId);
2253         }
2254
2255         for (int i = 0; i < blobsToKeep_.size(); i++)
2256         {
2257             blobManager.addReference(blobsToKeep_[i]);
2258         }
2259
2260         for (it = layers.begin(); it != layers.end(); it++)
2261         {
2262             int lid = it->first;
2263             allocateLayer(lid, layersShapes);
2264         }
2265
2266         layersTimings.resize(lastLayerId + 1, 0);
2267         fuseLayers(blobsToKeep_);
2268     }
2269
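     // Runs a single layer: when the preferable backend is not OpenCV and a backend
     // node exists, the node is used; otherwise the layer falls back to the OpenCV
     // (OpenCL or CPU) implementation. Skipped layers are not executed and record
     // an effectively zero time.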
2270     void forwardLayer(LayerData &ld)
2271     {
2272         CV_TRACE_FUNCTION();
2273
2274         Ptr<Layer> layer = ld.layerInstance;
2275
2276         TickMeter tm;
2277         tm.start();
2278
2279         if( !ld.skip )
2280         {
2281             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2282             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2283             {
2284                 if (isAsync)
2285                     CV_Error(Error::StsNotImplemented, "Default implementation fallback is not supported in asynchronous mode");
2286
2287                 if (!layer->supportBackend(DNN_BACKEND_OPENCV))
2288                     CV_Error(Error::StsNotImplemented, format("Layer \"%s\" of type \"%s\" unsupported on OpenCV backend",
2289                                                        ld.name.c_str(), ld.type.c_str()));
2290
2291                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2292                 {
2293                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2294                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2295                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2296                     layer->forward(umat_inputBlobs,
2297                                    umat_outputBlobs,
2298                                    umat_internalBlobs);
2299                     if (DNN_CHECK_NAN_INF)
2300                     {
2301                         bool fail = false;
2302                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2303                         {
2304                             UMat& u = umat_outputBlobs[i];
2305                             Mat m;
2306                             if (u.depth() == CV_16S) // FP16
2307                                 convertFp16(u, m);
2308                             else
2309                                 m = u.getMat(ACCESS_READ);
2310                             if (!checkRange(m))
2311                             {
2312                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2313                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2314                                 fail = true;
2315                             }
2316                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2317                             {
2318                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2319                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2320                                 fail = true;
2321                             }
2322                         }
2323                         if (fail)
2324                         {
2325                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2326                             {
2327                                 UMat& u = umat_inputBlobs[i];
2328                                 Mat m;
2329                                 if (u.depth() == CV_16S) // FP16
2330                                     convertFp16(u, m);
2331                                 else
2332                                     m = u.getMat(ACCESS_READ);
2333                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2334                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2335                             }
2336                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2337                             {
2338                                 UMat& u = umat_outputBlobs[i];
2339                                 Mat m;
2340                                 if (u.depth() == CV_16S) // FP16
2341                                     convertFp16(u, m);
2342                                 else
2343                                     m = u.getMat(ACCESS_READ);
2344                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2345                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2346                             }
2347                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2348                             {
2349                                 UMat& u = umat_internalBlobs[i];
2350                                 Mat m;
2351                                 if (u.depth() == CV_16S) // FP16
2352                                     convertFp16(u, m);
2353                                 else
2354                                     m = u.getMat(ACCESS_READ);
2355                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2356                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2357                             }
2358                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2359                                 CV_Assert(!fail);
2360                         }
2361                     }
2362                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2363                 }
2364                 else
2365                 {
2366                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2367                     {
2368                         if (!ld.inputBlobsWrappers[i].empty())
2369                             ld.inputBlobsWrappers[i]->copyToHost();
2370                     }
2371
2372                     std::vector<Mat> inps(ld.inputBlobs.size());
2373                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
2374                     {
2375                         inps[i] = *ld.inputBlobs[i];
2376                     }
2377                     layer->forward(inps, ld.outputBlobs, ld.internals);
2378
2379                     if (DNN_CHECK_NAN_INF)
2380                     {
2381                         bool fail = false;
2382                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2383                         {
2384                             const Mat& m = ld.outputBlobs[i];
2385                             if (!checkRange(m))
2386                             {
2387                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2388                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2389                                 fail = true;
2390                             }
2391                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2392                             {
2393                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2394                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2395                                 fail = true;
2396                             }
2397                         }
2398                         if (fail)
2399                         {
2400                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2401                             {
2402                                 const Mat* pM = ld.inputBlobs[i];
2403                                 if (!pM)
2404                                 {
2405                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
2406                                     continue;
2407                                 }
2408                                 const Mat& m = *pM;
2409                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2410                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2411                             }
2412                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2413                             {
2414                                 const Mat& m = ld.outputBlobs[i];
2415                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2416                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2417                             }
2418                             for (size_t i = 0; i < ld.internals.size(); ++i)
2419                             {
2420                                 const Mat& m = ld.internals[i];
2421                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2422                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2423                             }
2424                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2425                                 CV_Assert(!fail);
2426                         }
2427                     }
2428
2429                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2430                     {
2431                         if (!ld.outputBlobsWrappers[i].empty())
2432                             ld.outputBlobsWrappers[i]->setHostDirty();
2433                     }
2434                 }
2435             }
2436             else
2437             {
2438                 Ptr<BackendNode> node = it->second;
2439                 CV_Assert(!node.empty());
2440                 if (preferableBackend == DNN_BACKEND_HALIDE)
2441                 {
2442                     forwardHalide(ld.outputBlobsWrappers, node);
2443                 }
2444                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2445                 {
2446                     forwardInfEngine(ld.outputBlobsWrappers, node, isAsync);
2447                 }
2448                 else if (preferableBackend == DNN_BACKEND_VKCOM)
2449                 {
2450                     try
2451                     {
2452                         forwardVkCom(ld.outputBlobsWrappers, node);
2453                     }
2454                     catch (const cv::Exception& e)
2455                     {
2456                         CV_LOG_ERROR(NULL, "forwardVkCom failed, fallback to CPU implementation. " << e.what());
2457                         it->second = Ptr<BackendNode>();
2458                         forwardLayer(ld);
2459                     }
2460                 }
2461                 else
2462                 {
2463                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2464                 }
2465             }
2466         }
2467         else
2468             tm.reset();
2469
2470         tm.stop();
2471         layersTimings[ld.id] = tm.getTimeTicks();
2472
2473         ld.flag = 1;
2474     }
2475
2476     void forwardToLayer(LayerData &ld, bool clearFlags = true)
2477     {
2478         CV_TRACE_FUNCTION();
2479
2480         if (clearFlags)
2481         {
2482             MapIdToLayerData::iterator it;
2483             for (it = layers.begin(); it != layers.end(); it++)
2484                 it->second.flag = 0;
2485         }
2486
2487         // already forwarded
2488         if (ld.flag)
2489             return;
2490
2491         //forward parents
2492         MapIdToLayerData::iterator it;
2493         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2494         {
2495             LayerData &ld = it->second;
2496             if (ld.flag)
2497                 continue;
2498             forwardLayer(ld);
2499         }
2500
2501         //forward itself
2502         forwardLayer(ld);
2503     }
2504
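     // Infers the input/output/internal shapes of the layer with the given id,
     // recursively resolving the shapes of its input layers first, and stores the
     // result in inOutShapes.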
2505     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2506     {
2507         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2508
2509         if (id == 0 && inOutShapes[id].in[0].empty())
2510         {
2511             if (!layers[0].outputBlobs.empty())
2512             {
2513                 ShapesVec shapes;
2514                 for (int i = 0; i < layers[0].outputBlobs.size(); i++)
2515                 {
2516                     Mat& inp = layers[0].outputBlobs[i];
2517                     CV_Assert(inp.total());
2518                     shapes.push_back(shape(inp));
2519                 }
2520                 inOutShapes[0].in = shapes;
2521             }
2522             else
2523             {
2524                 inOutShapes[0].out.clear();
2525                 return;
2526             }
2527         }
2528
2529         if (inOutShapes[id].in.empty())
2530         {
2531             for(int i = 0; i < inputLayerIds.size(); i++)
2532             {
2533                 int layerId = inputLayerIds[i].lid;
2534                 LayersShapesMap::iterator it =
2535                         inOutShapes.find(layerId);
2536                 if(it == inOutShapes.end() ||
2537                         it->second.out.empty())
2538                 {
2539                     getLayerShapesRecursively(layerId, inOutShapes);
2540                 }
2541                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2542                 inOutShapes[id].in.push_back(shape);
2543             }
2544         }
2545         const ShapesVec& is = inOutShapes[id].in;
2546         ShapesVec& os = inOutShapes[id].out;
2547         ShapesVec& ints = inOutShapes[id].internal;
2548         int requiredOutputs = layers[id].requiredOutputs.size();
2549         inOutShapes[id].supportInPlace =
2550                 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2551
2552         for (int i = 0; i < ints.size(); i++)
2553             CV_Assert(total(ints[i]) > 0);
2554
2555         for (int i = 0; i < os.size(); i++)
2556             CV_Assert(total(os[i]) > 0);
2557     }
2558
2559     void getLayersShapes(const ShapesVec& netInputShapes,
2560                          LayersShapesMap& inOutShapes)
2561     {
2562         inOutShapes.clear();
2563
2564         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2565         for (MapIdToLayerData::iterator it = layers.begin();
2566              it != layers.end(); it++)
2567         {
2568             getLayerShapesRecursively(it->first, inOutShapes);
2569         }
2570     }
2571
2572     void getLayerShapes(const ShapesVec& netInputShapes,
2573                         const int layerId,
2574                         LayerShapes& shapes)
2575     {
2576         LayersShapesMap inOutShapes;
2577         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2578         getLayerShapesRecursively(layerId, inOutShapes);
2579         shapes = inOutShapes[layerId];
2580     }
2581
2582     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2583     {
2584         return *std::max_element(pins.begin(), pins.end());
2585     }
2586
2587     Mat getBlob(const LayerPin& pin)
2588     {
2589         CV_TRACE_FUNCTION();
2590
2591         if (!pin.valid())
2592             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2593
2594         LayerData &ld = layers[pin.lid];
2595         if ((size_t)pin.oid >= ld.outputBlobs.size())
2596         {
2597             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %zu outputs, "
2598                                            "the #%d was requested", ld.name.c_str(),
2599                                            ld.outputBlobs.size(), pin.oid));
2600         }
2601         if (preferableTarget != DNN_TARGET_CPU)
2602         {
2603             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2604             // Transfer data to CPU if it's required.
2605             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2606         }
2607
2608         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2609         {
2610             convertFp16(ld.outputBlobs[pin.oid], output_blob);
2611             return output_blob;
2612         }
2613         else
2614             return ld.outputBlobs[pin.oid];
2615     }
2616
2617     Mat getBlob(String outputName)
2618     {
2619         return getBlob(getPinByAlias(outputName));
2620     }
2621
2622 #ifdef CV_CXX11
2623     AsyncArray getBlobAsync(const LayerPin& pin)
2624     {
2625         CV_TRACE_FUNCTION();
2626 #ifdef HAVE_INF_ENGINE
2627         if (!pin.valid())
2628             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2629
2630         LayerData &ld = layers[pin.lid];
2631         if ((size_t)pin.oid >= ld.outputBlobs.size())
2632         {
2633             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2634                                            "the #%d was requested", ld.name.c_str(),
2635                                            (int)ld.outputBlobs.size(), (int)pin.oid));
2636         }
2637         if (preferableTarget != DNN_TARGET_CPU)
2638         {
2639             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2640             // Transfer data to CPU if it's required.
2641             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2642         }
2643         CV_Assert(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE);
2644
2645         Ptr<InfEngineBackendWrapper> wrapper = ld.outputBlobsWrappers[pin.oid].dynamicCast<InfEngineBackendWrapper>();
2646         return std::move(wrapper->futureMat);
2647 #else
2648         CV_Error(Error::StsNotImplemented, "DNN_BACKEND_INFERENCE_ENGINE backend is required");
2649 #endif
2650     }
2651
2652     AsyncArray getBlobAsync(String outputName)
2653     {
2654         return getBlobAsync(getPinByAlias(outputName));
2655     }
2656 #endif  // CV_CXX11
2657 };
2658
2659 Net::Net() : impl(new Net::Impl)
2660 {
2661 }
2662
2663 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2664 {
2665 #ifndef HAVE_INF_ENGINE
2666     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2667 #else
2668     InferenceEngine::CNNNetReader reader;
2669     reader.ReadNetwork(xml);
2670     reader.ReadWeights(bin);
2671
2672     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2673
2674     std::vector<String> inputsNames;
2675     std::vector<MatShape> inp_shapes;
2676     for (auto& it : ieNet.getInputsInfo())
2677     {
2678         inputsNames.push_back(it.first);
2679         std::vector<size_t> dims = it.second->getTensorDesc().getDims();
2680         inp_shapes.push_back(std::vector<int>(dims.begin(), dims.end()));
2681     }
2682
2683     Net cvNet;
2684     cvNet.setInputsNames(inputsNames);
2685
2686     // set empty input to determine input shapes
2687     for (int inp_id = 0; inp_id < inputsNames.size(); ++inp_id)
2688     {
2689         cvNet.setInput(Mat(inp_shapes[inp_id], CV_32F), inputsNames[inp_id]);
2690     }
2691
2692     Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(InferenceEngine::Builder::Layer("")));
2693     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2694     for (auto& it : ieNet.getOutputsInfo())
2695     {
2696         Ptr<Layer> cvLayer(new InfEngineBackendLayer(ieNet));
2697         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2698         CV_Assert(ieLayer);
2699
2700         LayerParams lp;
2701         int lid = cvNet.addLayer(it.first, "", lp);
2702
2703         LayerData& ld = cvNet.impl->layers[lid];
2704         cvLayer->name = it.first;
2705         cvLayer->type = ieLayer->type;
2706         ld.layerInstance = cvLayer;
2707         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2708
2709         for (int i = 0; i < inputsNames.size(); ++i)
2710             cvNet.connect(0, i, lid, i);
2711     }
2712     cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2713
2714     cvNet.impl->skipInfEngineInit = true;
2715     return cvNet;
2716 #endif  // HAVE_INF_ENGINE
2717 }
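// Usage sketch (illustrative only; the .xml/.bin paths are placeholders and an
// Inference Engine build is required, as checked above):
//
//     Net ieNet = Net::readFromModelOptimizer("model.xml", "model.bin");
//     ieNet.setInput(blob);            // 'blob' is prepared by the caller
//     Mat out = ieNet.forward();       // backend is already DNN_BACKEND_INFERENCE_ENGINE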
2718
2719 Net::~Net()
2720 {
2721 }
2722
2723 int Net::addLayer(const String &name, const String &type, LayerParams &params)
2724 {
2725     CV_TRACE_FUNCTION();
2726
2727     if (impl->getLayerId(name) >= 0)
2728     {
2729         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" is already in the net");
2730         return -1;
2731     }
2732
2733     int id = ++impl->lastLayerId;
2734     impl->layerNameToId.insert(std::make_pair(name, id));
2735     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2736
2737     return id;
2738 }
2739
2740 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
2741 {
2742     CV_TRACE_FUNCTION();
2743
2744     int prvLid = impl->lastLayerId;
2745     int newLid = this->addLayer(name, type, params);
2746     this->connect(prvLid, 0, newLid, 0);
2747     return newLid;
2748 }
2749
2750 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2751 {
2752     CV_TRACE_FUNCTION();
2753
2754     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2755 }
2756
2757 void Net::connect(String _outPin, String _inPin)
2758 {
2759     CV_TRACE_FUNCTION();
2760
2761     LayerPin outPin = impl->getPinByAlias(_outPin);
2762     LayerPin inpPin = impl->getPinByAlias(_inPin);
2763
2764     CV_Assert(outPin.valid() && inpPin.valid());
2765
2766     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2767 }
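// Building a graph by hand (normally done by the importers); a minimal sketch where
// the layer names, types and params are placeholders for whatever LayerFactory knows:
//
//     Net net;
//     LayerParams lp;                                              // fill per layer type
//     int conv = net.addLayerToPrev("conv1", "Convolution", lp);   // attaches to input layer 0
//     int relu = net.addLayer("relu1", "ReLU", lp);
//     net.connect(conv, 0, relu, 0);                               // conv1 output #0 -> relu1 input #0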
2768
2769 Mat Net::forward(const String& outputName)
2770 {
2771     CV_TRACE_FUNCTION();
2772
2773     String layerName = outputName;
2774
2775     if (layerName.empty())
2776         layerName = getLayerNames().back();
2777
2778     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2779     impl->setUpNet(pins);
2780     impl->forwardToLayer(impl->getLayerData(layerName));
2781
2782     return impl->getBlob(layerName);
2783 }
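// Typical single-output inference (sketch; net creation and input blob preparation
// happen elsewhere, e.g. via readNet()/blobFromImage(), and "prob" is a placeholder
// layer name):
//
//     net.setInput(inputBlob);
//     Mat result = net.forward();          // runs up to the last registered layer
//     Mat probs  = net.forward("prob");    // or up to a named layer/output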
2784
2785 AsyncArray Net::forwardAsync(const String& outputName)
2786 {
2787     CV_TRACE_FUNCTION();
2788 #ifdef CV_CXX11
2789     String layerName = outputName;
2790
2791     if (layerName.empty())
2792         layerName = getLayerNames().back();
2793
2794     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2795     impl->setUpNet(pins);
2796
2797     if (impl->preferableBackend != DNN_BACKEND_INFERENCE_ENGINE)
2798         CV_Error(Error::StsNotImplemented, "Asynchronous forward is supported only with the DNN_BACKEND_INFERENCE_ENGINE backend");
2799
2800     impl->isAsync = true;
2801     impl->forwardToLayer(impl->getLayerData(layerName));
2802     impl->isAsync = false;
2803
2804     return impl->getBlobAsync(layerName);
2805 #else
2806     CV_Error(Error::StsNotImplemented, "Asynchronous forward without C++11");
2807 #endif  // CV_CXX11
2808 }
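// Asynchronous inference sketch (only valid with DNN_BACKEND_INFERENCE_ENGINE, as
// enforced above; AsyncArray::get() blocks until the request completes):
//
//     net.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
//     net.setInput(inputBlob);
//     AsyncArray asyncOut = net.forwardAsync();
//     Mat result;
//     asyncOut.get(result);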
2809
2810 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2811 {
2812     CV_TRACE_FUNCTION();
2813
2814     String layerName = outputName;
2815
2816     if (layerName.empty())
2817         layerName = getLayerNames().back();
2818
2819     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2820     impl->setUpNet(pins);
2821     impl->forwardToLayer(impl->getLayerData(layerName));
2822
2823     LayerPin pin = impl->getPinByAlias(layerName);
2824     LayerData &ld = impl->layers[pin.lid];
2825
2826     if (outputBlobs.isUMat())
2827     {
2828         impl->getBlob(layerName).copyTo(outputBlobs);
2829     }
2830     else if (outputBlobs.isMat())
2831     {
2832         outputBlobs.assign(impl->getBlob(layerName));
2833     }
2834     else if (outputBlobs.isMatVector())
2835     {
2836         if (impl->preferableTarget != DNN_TARGET_CPU)
2837         {
2838             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2839             {
2840                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2841                 ld.outputBlobsWrappers[i]->copyToHost();
2842             }
2843         }
2844         if (ld.outputBlobs[0].depth() == CV_32F)
2845         {
2846             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2847             outputvec = ld.outputBlobs;
2848         } else {
2849             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2850             outputvec.resize(ld.outputBlobs.size());
2851             for (int i = 0; i < outputvec.size(); i++)
2852                 convertFp16(ld.outputBlobs[i], outputvec[i]);
2853         }
2854     }
2855     else if (outputBlobs.isUMatVector())
2856     {
2857         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
2858
2859         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
2860             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
2861         {
2862             if (impl->preferableTarget == DNN_TARGET_OPENCL)
2863                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2864             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
2865             {
2866                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2867                 outputvec.resize(out_vec.size());
2868                 for (int i = 0; i < out_vec.size(); i++)
2869                     convertFp16(out_vec[i], outputvec[i]);
2870             }
2871         }
2872         else
2873         {
2874             outputvec.resize(ld.outputBlobs.size());
2875             for (int i = 0; i < outputvec.size(); ++i)
2876                 ld.outputBlobs[i].copyTo(outputvec[i]);
2877         }
2878     }
2879 }
2880
2881 void Net::forward(OutputArrayOfArrays outputBlobs,
2882                   const std::vector<String>& outBlobNames)
2883 {
2884     CV_TRACE_FUNCTION();
2885
2886     std::vector<LayerPin> pins;
2887     for (int i = 0; i < outBlobNames.size(); i++)
2888     {
2889         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2890     }
2891
2892     impl->setUpNet(pins);
2893
2894     LayerPin out = impl->getLatestLayerPin(pins);
2895
2896     impl->forwardToLayer(impl->getLayerData(out.lid));
2897
2898     std::vector<Mat> matvec;
2899     for (int i = 0; i < pins.size(); i++)
2900     {
2901         matvec.push_back(impl->getBlob(pins[i]));
2902     }
2903
2904     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2905     outputvec = matvec;
2906 }
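// Fetching several outputs in one pass (sketch; the output names usually come from
// getUnconnectedOutLayersNames(), defined further below):
//
//     std::vector<Mat> outs;
//     net.setInput(inputBlob);
//     net.forward(outs, net.getUnconnectedOutLayersNames());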
2907
2908 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
2909                      const std::vector<String>& outBlobNames)
2910 {
2911     CV_TRACE_FUNCTION();
2912
2913     std::vector<LayerPin> pins;
2914     for (int i = 0; i < outBlobNames.size(); i++)
2915     {
2916         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2917     }
2918
2919     impl->setUpNet(pins);
2920
2921     LayerPin out = impl->getLatestLayerPin(pins);
2922
2923     impl->forwardToLayer(impl->getLayerData(out.lid));
2924
2925     outputBlobs.resize(outBlobNames.size());
2926     for (int i = 0; i < outBlobNames.size(); i++)
2927     {
2928         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2929         outputBlobs[i].resize(lp.size());
2930         for (int j = 0; j < lp.size(); j++)
2931         {
2932             outputBlobs[i][j] = impl->getBlob(lp[j]);
2933         }
2934     }
2935 }
2936
2937 void Net::setPreferableBackend(int backendId)
2938 {
2939     CV_TRACE_FUNCTION();
2940     CV_TRACE_ARG(backendId);
2941
2942     if( impl->preferableBackend != backendId )
2943     {
2944         impl->preferableBackend = backendId;
2945         impl->netWasAllocated = false;
2946         impl->clear();
2947     }
2948 }
2949
2950 void Net::setPreferableTarget(int targetId)
2951 {
2952     CV_TRACE_FUNCTION();
2953     CV_TRACE_ARG(targetId);
2954
2955     if( impl->preferableTarget != targetId )
2956     {
2957         impl->preferableTarget = targetId;
2958         if (IS_DNN_OPENCL_TARGET(targetId))
2959         {
2960 #ifndef HAVE_OPENCL
2961 #ifdef HAVE_INF_ENGINE
2962             if (impl->preferableBackend == DNN_BACKEND_OPENCV)
2963 #else
2964             if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
2965                 impl->preferableBackend == DNN_BACKEND_OPENCV)
2966 #endif  // HAVE_INF_ENGINE
2967                 impl->preferableTarget = DNN_TARGET_CPU;
2968 #else
2969             bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
2970             if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
2971                 impl->preferableTarget = DNN_TARGET_OPENCL;
2972 #endif
2973         }
2974         impl->netWasAllocated = false;
2975         impl->clear();
2976     }
2977 }
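// Backend/target selection sketch; unavailable combinations fall back as handled
// above (no OpenCL build -> CPU, no cl_khr_fp16 -> plain OpenCL):
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL);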
2978
2979 void Net::setInputsNames(const std::vector<String> &inputBlobNames)
2980 {
2981     CV_TRACE_FUNCTION();
2982
2983     impl->netInputLayer->setNames(inputBlobNames);
2984 }
2985
2986 void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
2987 {
2988     CV_TRACE_FUNCTION();
2989     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
2990
2991     LayerPin pin;
2992     pin.lid = 0;
2993     pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);
2994
2995     if (!pin.valid())
2996         CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");
2997
2998     LayerData &ld = impl->layers[pin.lid];
2999     const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
3000     ld.outputBlobs.resize(numInputs);
3001     ld.outputBlobsWrappers.resize(numInputs);
3002     impl->netInputLayer->inputsData.resize(numInputs);
3003     impl->netInputLayer->scaleFactors.resize(numInputs);
3004     impl->netInputLayer->means.resize(numInputs);
3005
3006     MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
3007     Mat blob_ = blob.getMat();
3008     bool oldShape = prevShape == shape(blob_);
3009     if (oldShape)
3010     {
3011         blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
3012     }
3013     else
3014     {
3015         ld.outputBlobs[pin.oid] = blob_.clone();
3016         impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
3017     }
3018
3019     if (!ld.outputBlobsWrappers[pin.oid].empty())
3020     {
3021         ld.outputBlobsWrappers[pin.oid]->setHostDirty();
3022     }
3023     impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
3024     impl->netInputLayer->means[pin.oid] = mean;
3025     impl->netWasAllocated = impl->netWasAllocated && oldShape;
3026 }
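// Feeding an image as the network input (sketch; blobFromImage() is part of this
// module's public API, 'img' is a BGR image loaded by the caller, and the scale,
// size and input name below are placeholders for model-specific preprocessing):
//
//     Mat blob = blobFromImage(img, 1.0/255.0, Size(224, 224), Scalar(), /*swapRB=*/true, /*crop=*/false);
//     net.setInput(blob, "data");   // "data" names the input blob when the model defines one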
3027
3028 Mat Net::getParam(LayerId layer, int numParam)
3029 {
3030     LayerData &ld = impl->getLayerData(layer);
3031     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3032     CV_Assert(numParam < (int)layerBlobs.size());
3033     return layerBlobs[numParam];
3034 }
3035
3036 void Net::setParam(LayerId layer, int numParam, const Mat &blob)
3037 {
3038     LayerData &ld = impl->getLayerData(layer);
3039
3040     std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
3041     CV_Assert(numParam < (int)layerBlobs.size());
3042     // We don't make strong checks here; use this function carefully.
3043     layerBlobs[numParam] = blob;
3044 }
3045
3046 int Net::getLayerId(const String &layer)
3047 {
3048     return impl->getLayerId(layer);
3049 }
3050
3051 String parseLayerParams(const String& name, const LayerParams& lp) {
3052     DictValue param = lp.get(name);
3053     std::ostringstream out;
3054     out << name << " ";
3055     switch (param.size()) {
3056         case 1: out << ": "; break;
3057         case 2: out << "(HxW): "; break;
3058         case 3: out << "(DxHxW): "; break;
3059         default: CV_Error(Error::StsNotImplemented, format("Unsupported %s size = %d", name.c_str(), param.size()));
3060     }
3061     for (size_t i = 0; i < param.size() - 1; i++) {
3062         out << param.get<int>(i) << " x ";
3063     }
3064     out << param.get<int>(param.size() - 1) << "\\l";
3065     return out.str();
3066 }
3067
3068 String Net::dump()
3069 {
3070     CV_Assert(!empty());
3071
3072     if (impl->netInputLayer->inputsData.empty())
3073         CV_Error(Error::StsError, "Network input is not set; call setInput() before dump()");
3074
3075     if (!impl->netWasAllocated)
3076         impl->setUpNet();
3077
3078     std::ostringstream out;
3079     std::map<int, LayerData>& map = impl->layers;
3080     int prefBackend = impl->preferableBackend;
3081     std::vector<std::vector<int> > skippedLayers;
3082     std::vector<int> skipId;
3083     std::vector<int> allLayers(map.size(), -1);
3084     int idPrev = -1;
3085     Ptr<BackendNode> prevNode;
3086     for (std::map<int, LayerData>::reverse_iterator rit = map.rbegin(); rit != map.rend(); ++rit)
3087     {
3088         std::map<int, Ptr<BackendNode> >::iterator itBackend = rit->second.backendNodes.find(prefBackend);
3089         if (prefBackend == DNN_BACKEND_OPENCV || itBackend == rit->second.backendNodes.end() ||
3090             itBackend->second.empty())
3091         {
3092                 if (rit->second.skip)
3093                     skipId.push_back(rit->first);
3094                 else if (!skipId.empty())
3095                 {
3096                     if (prefBackend == DNN_BACKEND_OPENCV || prevNode.empty())
3097                         skipId.push_back(rit->first);
3098                     else if (idPrev != -1)
3099                         skipId.push_back(idPrev);
3100
3101                     std::sort(skipId.begin(), skipId.end());
3102                     for (int i = 0; i < skipId.size(); i++) {
3103                         allLayers[skipId[i]] = skippedLayers.size();
3104                     }
3105                     skippedLayers.push_back(skipId);
3106                     skipId.clear();
3107                 }
3108         }
3109         else
3110         {
3111             if (itBackend->second == prevNode)
3112                 skipId.push_back(idPrev);
3113             else if (!skipId.empty())
3114             {
3115                 skipId.push_back(idPrev);
3116                 std::sort(skipId.begin(), skipId.end());
3117                 for (int i = 0; i < skipId.size(); i++) {
3118                     allLayers[skipId[i]] = skippedLayers.size();
3119                 }
3120                 skippedLayers.push_back(skipId);
3121                 skipId.clear();
3122             }
3123             idPrev = rit->first;
3124             prevNode = itBackend->second;
3125         }
3126     }
3127     String colors[] = {"#ffffb3", "#fccde5", "#8dd3c7", "#bebada", "#80b1d3", "#fdb462"};
3128     String backend;
3129     switch (prefBackend) {
3130         case DNN_BACKEND_DEFAULT: backend = "DEFAULT/"; break;
3131         case DNN_BACKEND_HALIDE: backend = "HALIDE/"; break;
3132         case DNN_BACKEND_INFERENCE_ENGINE: backend = "DLIE/"; break;
3133         case DNN_BACKEND_OPENCV: backend = "OCV/"; break;
3134     }
3135     out << "digraph G {" << '\n';
3136     // Add nodes
3137     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3138     {
3139         String name = it->second.params.name;
3140         if (allLayers[it->first] == -1 && !name.empty()) {
3141             out << "    " << "\"" << name << "\"" << " [label=\"";
3142             skipId.clear();
3143             skipId.push_back(it->first);
3144         }
3145         else if (name.empty() || it->first != skippedLayers[allLayers[it->first]][0])
3146             continue;
3147         else { // first node in cluster : it->first == skippedLayers[allLayers[it->first]][0]
3148             int cluster = allLayers[it->first];
3149             out << "    " << "\"" << "cluster_" << cluster << "\"" << " [label=\"{";
3150             skipId = skippedLayers[allLayers[it->first]]; // vertices in current cluster
3151         }
3152         for (int i = 0; i < skipId.size(); i++)
3153         {
3154             LayerParams& lp = map[skipId[i]].params;
3155             if (!lp.name.empty()) {
3156                 if (i > 0) {
3157                     out << " | ";
3158                 }
3159                 out << lp.name << "\\n" << lp.type << "\\n";
3160                 if (lp.has("kernel_size")) {
3161                     String kernel = parseLayerParams("kernel_size", lp);
3162                     out << kernel;
3163                 } else if (lp.has("kernel_h") && lp.has("kernel_w")) {
3164                     DictValue h = lp.get("kernel_h");
3165                     DictValue w = lp.get("kernel_w");
3166                     out << "kernel (HxW): " << h << " x " << w << "\\l";
3167                 }
3168                 if (lp.has("stride")) {
3169                     String stride = parseLayerParams("stride", lp);
3170                     out << stride;
3171                 } else if (lp.has("stride_h") && lp.has("stride_w")) {
3172                     DictValue h = lp.get("stride_h");
3173                     DictValue w = lp.get("stride_w");
3174                     out << "stride (HxW): " << h << " x " << w << "\\l";
3175                 }
3176                 if (lp.has("dilation")) {
3177                     String dilation = parseLayerParams("dilation", lp);
3178                     out << dilation;
3179                 } else if (lp.has("dilation_h") && lp.has("dilation_w")) {
3180                     DictValue h = lp.get("dilation_h");
3181                     DictValue w = lp.get("dilation_w");
3182                     out << "dilation (HxW): " << h << " x " << w << "\\l";
3183                 }
3184                 if (lp.has("pad")) {
3185                     DictValue pad = lp.get("pad");
3186                     out << "pad ";
3187                     switch (pad.size()) {
3188                         case 1: out << ": " << pad << "\\l"; break;
3189                         case 2: out << "(HxW): (" << pad.get<int>(0) << " x " << pad.get<int>(1) << ")" << "\\l"; break;
3190                         case 4: out << "(HxW): (" << pad.get<int>(0) << ", " << pad.get<int>(2) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(3) << ")" << "\\l"; break;
3191                         case 6: out << "(DxHxW): (" << pad.get<int>(0) << ", " << pad.get<int>(3) << ") x (" << pad.get<int>(1) << ", " << pad.get<int>(4)
3192                                 << ") x (" << pad.get<int>(2) << ", " << pad.get<int>(5) << ")" << "\\l"; break;
3193                         default: CV_Error(Error::StsNotImplemented,  format("Unsupported pad size = %d", pad.size()));
3194                     }
3195                  } else if (lp.has("pad_l") && lp.has("pad_t") && lp.has("pad_r") && lp.has("pad_b")) {
3196                      DictValue l = lp.get("pad_l");
3197                      DictValue t = lp.get("pad_t");
3198                      DictValue r = lp.get("pad_r");
3199                      DictValue b = lp.get("pad_b");
3200                      out << "pad (HxW): (" << t << ", " << b << ") x (" << l << ", " << r << ")" << "\\l";
3201                  }
3202                  else if (lp.has("pooled_w") || lp.has("pooled_h")) {
3203                      DictValue h = lp.get("pooled_h");
3204                      DictValue w = lp.get("pooled_w");
3205                      out << "pooled (HxW): " << h << " x " << w << "\\l";
3206                  }
3207                  if (lp.has("pool")) {
3208                      out << "pool: " << lp.get("pool") << "\\l";
3209                  }
3210                  if (lp.has("global_pooling")) {
3211                      out << "global_pooling: " << lp.get("global_pooling") << "\\l";
3212                  }
3213                  if (lp.has("group")) {
3214                      out << "group: " << lp.get("group") << "\\l";
3215                  }
3216              }
3217          }
3218          if (!it->second.outputBlobs.empty())
3219              out << "output: " << it->second.outputBlobs[0].size << "\\l";
3220
3221          Ptr<BackendNode> layerBackend = it->second.backendNodes[prefBackend];
3222          out << (!layerBackend.empty() ? backend : "OCV/");
3223          int colorId = 0;
3224          switch (it->second.layerInstance->preferableTarget) {
3225              case DNN_TARGET_CPU: out << "CPU\\n"; colorId = layerBackend.empty() ? 0 : 5; break;
3226              case DNN_TARGET_OPENCL: out << "OCL\\n"; colorId = 1; break;
3227              case DNN_TARGET_OPENCL_FP16: out << "OCL_FP16\\n"; colorId = 2; break;
3228              case DNN_TARGET_MYRIAD: out << "MYRIAD\\n"; colorId = 3; break;
3229              case DNN_TARGET_FPGA: out << "FPGA\\n"; colorId = 4; break;
3230          }
3231          out << ((skipId.size() == 1)? "\" " : " }\" ");
3232          out << "fillcolor=\"" << colors[colorId] << "\" ";
3233          out << "style=filled ";
3234          out << "shape=" << ((skipId.size() == 1)? "box" : "record") << "]" << '\n';
3235     }
3236     out << '\n';
3237     // Add edges
3238     int inputsSize = impl->netInputLayer->outNames.size();
3239     for (std::map<int, LayerData>::iterator it = map.begin(); it != map.end(); ++it)
3240     {
3241         if (allLayers[it->first] == -1)  // node
3242         {
3243             for (int i = 0; i < it->second.consumers.size(); i++)
3244             {
3245                 int outId = it->second.consumers[i].lid;
3246                 if (it == map.begin() && inputsSize > 1)
3247                     out << "    " << "\"" << it->second.name << "_" << i << "\"" << " -> ";
3248                 else
3249                     out << "    " << "\"" << it->second.name << "\"" << " -> ";
3250                 if (allLayers[outId] == -1)  // node
3251                     out << "\"" << map[outId].name << "\"" << '\n';
3252                 else  // cluster
3253                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3254             }
3255         }
3256         else if (it->first == skippedLayers[allLayers[it->first]].back())  // edges from last layer in cluster
3257         {
3258             for (int i = 0; i < it->second.consumers.size(); i++)
3259             {
3260                 int outId = it->second.consumers[i].lid;
3261                 if (allLayers[outId] == -1) { // node
3262                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3263                     out << "\"" << map[outId].name << "\"" << '\n';
3264                 }
3265                 else if (allLayers[outId] != allLayers[it->first]) { // another cluster
3266                     out << "    " << "\"" << "cluster_" << allLayers[it->first] << "\"" << " -> ";
3267                     out << "\"" << "cluster_" << allLayers[outId] << "\"" << '\n';
3268                 }
3269             }
3270         }
3271     }
3272     out << "}";
3273     return out.str();
3274 }
3275
3276 void Net::dumpToFile(const String& path) {
3277     std::ofstream file(path.c_str());
3278     file << dump();
3279     file.close();
3280 }
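// dump() emits Graphviz source, so a typical round trip looks like the sketch below
// (the paths are placeholders; setInput() must have been called, see the check in dump()):
//
//     net.dumpToFile("net.dot");
//     // render offline:  dot -Tpng net.dot -o net.png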
3281
3282 Ptr<Layer> Net::getLayer(LayerId layerId)
3283 {
3284     LayerData &ld = impl->getLayerData(layerId);
3285     return ld.getLayerInstance();
3286 }
3287
3288 std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
3289 {
3290     LayerData &ld = impl->getLayerData(layerId);
3291     if (!ld.layerInstance)
3292         CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));
3293
3294     std::vector<Ptr<Layer> > inputLayers;
3295     inputLayers.reserve(ld.inputLayersId.size());
3296     std::set<int>::iterator it;
3297     for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
3298         inputLayers.push_back(getLayer(*it));
3299     }
3300     return inputLayers;
3301 }
3302
3303 std::vector<String> Net::getLayerNames() const
3304 {
3305     std::vector<String> res;
3306     res.reserve(impl->layers.size());
3307
3308     Impl::MapIdToLayerData::iterator it;
3309     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3310     {
3311         if (it->second.id) //skip Data layer
3312             res.push_back(it->second.name);
3313     }
3314
3315     return res;
3316 }
3317
3318 bool Net::empty() const
3319 {
3320     return impl->layers.size() <= 1; //first layer is default Data layer
3321 }
3322
3323 std::vector<int> Net::getUnconnectedOutLayers() const
3324 {
3325     std::vector<int> layersIds;
3326
3327     Impl::MapIdToLayerData::iterator it;
3328     for (it = impl->layers.begin(); it != impl->layers.end(); it++)
3329     {
3330         int lid = it->first;
3331         LayerData &ld = it->second;
3332
3333         if (ld.requiredOutputs.size() == 0)
3334             layersIds.push_back(lid);
3335     }
3336
3337     return layersIds;
3338 }
3339
3340 std::vector<String> Net::getUnconnectedOutLayersNames() const
3341 {
3342     std::vector<int> ids = getUnconnectedOutLayers();
3343     const size_t n = ids.size();
3344     std::vector<String> names(n);
3345     for (size_t i = 0; i < n; ++i)
3346     {
3347         names[i] = impl->layers[ids[i]].name;
3348     }
3349     return names;
3350 }
3351
3352 void Net::getLayersShapes(const ShapesVec& netInputShapes,
3353                           std::vector<int>& layersIds,
3354                           std::vector<ShapesVec>& inLayersShapes,
3355                           std::vector<ShapesVec>& outLayersShapes) const
3356 {
3357     layersIds.clear();
3358     inLayersShapes.clear();
3359     outLayersShapes.clear();
3360
3361     Impl::LayersShapesMap inOutShapes;
3362     impl->getLayersShapes(netInputShapes, inOutShapes);
3363
3364     for(Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
3365         it != inOutShapes.end(); it++)
3366     {
3367         layersIds.push_back(it->first);
3368         inLayersShapes.push_back(it->second.in);
3369         outLayersShapes.push_back(it->second.out);
3370     }
3371 }
3372
3373 void Net::getLayersShapes(const MatShape& netInputShape,
3374                           std::vector<int>& layerIds,
3375                           std::vector<ShapesVec>& inLayersShapes,
3376                           std::vector<ShapesVec>& outLayersShapes) const
3377 {
3378     getLayersShapes(ShapesVec(1, netInputShape),
3379                     layerIds, inLayersShapes, outLayersShapes);
3380 }
3381
3382 void Net::getLayerShapes(const MatShape& netInputShape,
3383                          const int layerId,
3384                          ShapesVec& inLayerShapes,
3385                          ShapesVec& outLayerShapes) const
3386 {
3387     getLayerShapes(ShapesVec(1, netInputShape),
3388                    layerId, inLayerShapes, outLayerShapes);
3389
3390 }
3391
3392 void Net::getLayerShapes(const ShapesVec& netInputShapes,
3393                     const int layerId,
3394                     ShapesVec& inLayerShapes,
3395                     ShapesVec& outLayerShapes) const
3396 {
3397     LayerShapes shapes;
3398     impl->getLayerShapes(netInputShapes, layerId, shapes);
3399     inLayerShapes = shapes.in;
3400     outLayerShapes = shapes.out;
3401 }
3402
3403 int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
3404 {
3405     CV_TRACE_FUNCTION();
3406
3407     int64 flops = 0;
3408     std::vector<int> ids;
3409     std::vector<std::vector<MatShape> > inShapes, outShapes;
3410     getLayersShapes(netInputShapes, ids, inShapes, outShapes);
3411     CV_Assert(inShapes.size() == outShapes.size());
3412     CV_Assert(inShapes.size() == ids.size());
3413
3414     for(int i = 0; i < ids.size(); i++)
3415     {
3416         flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
3417                                                                    outShapes[i]);
3418     }
3419
3420     return flops;
3421 }
3422
3423 int64 Net::getFLOPS(const MatShape& netInputShape) const
3424 {
3425     return getFLOPS(std::vector<MatShape>(1, netInputShape));
3426 }
3427
3428 int64 Net::getFLOPS(const int layerId,
3429               const std::vector<MatShape>& netInputShapes) const
3430 {
3431     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3432     CV_Assert(layer != impl->layers.end());
3433
3434     LayerShapes shapes;
3435     impl->getLayerShapes(netInputShapes, layerId, shapes);
3436
3437     return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
3438 }
3439
3440 int64 Net::getFLOPS(const int layerId,
3441               const MatShape& netInputShape) const
3442 {
3443     return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
3444 }
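// Complexity estimate sketch (the NCHW input shape is a placeholder; shape() comes
// from opencv2/dnn/shape_utils.hpp, included above):
//
//     double gflops = net.getFLOPS(shape(1, 3, 224, 224)) * 1e-9;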
3445
3446 void Net::getLayerTypes(std::vector<String>& layersTypes) const
3447 {
3448     layersTypes.clear();
3449
3450     std::map<String, int> layers;
3451     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3452          it != impl->layers.end(); it++)
3453     {
3454         if (layers.find(it->second.type) == layers.end())
3455             layers[it->second.type] = 0;
3456         layers[it->second.type]++;
3457     }
3458
3459     for (std::map<String, int>::iterator it = layers.begin();
3460          it != layers.end(); it++)
3461     {
3462         layersTypes.push_back(it->first);
3463     }
3464 }
3465
3466 int Net::getLayersCount(const String& layerType) const
3467 {
3468     int count = 0;
3469     for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
3470          it != impl->layers.end(); it++)
3471     {
3472         if (it->second.type == layerType)
3473             count++;
3474     }
3475     return count;
3476 }
3477
3478 void Net::getMemoryConsumption(const int layerId,
3479                                const std::vector<MatShape>& netInputShapes,
3480                                size_t& weights, size_t& blobs) const
3481 {
3482     CV_TRACE_FUNCTION();
3483
3484     Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
3485     CV_Assert(layer != impl->layers.end());
3486
3487     weights = blobs = 0;
3488
3489     for(int i = 0; i < layer->second.params.blobs.size(); i++)
3490     {
3491         const Mat& weightsBlob = layer->second.params.blobs[i];
3492         weights += weightsBlob.total()*weightsBlob.elemSize();
3493     }
3494
3495     ShapesVec inLayerShapes, outLayerShapes;
3496     getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
3497     for(int i = 0; i < outLayerShapes.size(); i++)
3498     {
3499         blobs += total(outLayerShapes[i]) * sizeof(float);
3500     }
3501 }
3502
3503 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3504                                size_t& weights, size_t& blobs) const
3505 {
3506     CV_TRACE_FUNCTION();
3507
3508     std::vector<int> layerIds;
3509     std::vector<size_t> w, b;
3510     getMemoryConsumption(netInputShapes, layerIds, w, b);
3511
3512     weights = blobs = 0;
3513     for(int i = 0; i < layerIds.size(); i++)
3514     {
3515         weights += w[i];
3516         blobs += b[i];
3517     }
3518 }
3519
3520 void Net::getMemoryConsumption(const int layerId,
3521                                const MatShape& netInputShape,
3522                                size_t& weights, size_t& blobs) const
3523 {
3524     getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
3525                          weights, blobs);
3526 }
3527
3528 void Net::getMemoryConsumption(const MatShape& netInputShape,
3529                                size_t& weights, size_t& blobs) const
3530 {
3531     getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
3532                          weights, blobs);
3533 }
3534
3535 void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
3536                                   std::vector<int>& layerIds, std::vector<size_t>& weights,
3537                                   std::vector<size_t>& blobs) const
3538 {
3539     CV_TRACE_FUNCTION();
3540
3541     layerIds.clear();
3542     weights.clear();
3543     blobs.clear();
3544
3545     std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;
3546
3547     getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);
3548
3549     for(int i = 0; i < layerIds.size(); i++)
3550     {
3551         size_t w = 0, b = 0;
3552         Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
3553         CV_Assert(layer != impl->layers.end());
3554
3555         for(int j = 0; j < layer->second.params.blobs.size(); j++)
3556         {
3557             const Mat& weightsBlob = layer->second.params.blobs[j];
3558             w += weightsBlob.total()*weightsBlob.elemSize();
3559         }
3560
3561         for(int j = 0; j < outLayerShapes[i].size(); j++)
3562         {
3563             b += total(outLayerShapes[i][j]) * sizeof(float);
3564         }
3565
3566         weights.push_back(w);
3567         blobs.push_back(b);
3568     }
3569 }
3570
3571 void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
3572                                std::vector<size_t>& weights, std::vector<size_t>& blobs) const
3573 {
3574     getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
3575                          weights, blobs);
3576 }
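// Memory estimate sketch (same placeholder input shape as for getFLOPS(); 'weights'
// counts parameter bytes, 'blobs' counts output blob bytes assuming 32-bit floats):
//
//     size_t weights = 0, blobs = 0;
//     net.getMemoryConsumption(shape(1, 3, 224, 224), weights, blobs);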
3577
3578 void Net::enableFusion(bool fusion)
3579 {
3580     if( impl->fusion != fusion )
3581     {
3582         impl->fusion = fusion;
3583         impl->netWasAllocated = false;
3584         impl->clear();
3585     }
3586 }
3587
3588 void Net::setHalideScheduler(const String& scheduler)
3589 {
3590     CV_TRACE_FUNCTION();
3591     CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());
3592
3593     impl->halideConfigFile = scheduler;
3594 }
3595
3596 int64 Net::getPerfProfile(std::vector<double>& timings)
3597 {
3598     timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
3599     int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
3600     return total;
3601 }
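// Per-layer timing sketch (meaningful only after at least one forward() call; the
// values are tick counts, so divide by getTickFrequency() for seconds):
//
//     std::vector<double> layerTimes;
//     int64 totalTicks = net.getPerfProfile(layerTimes);
//     double totalMs = totalTicks * 1000.0 / getTickFrequency();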
3602
3603 //////////////////////////////////////////////////////////////////////////
3604
3605 Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }
3606
3607 Layer::Layer(const LayerParams &params)
3608     : blobs(params.blobs), name(params.name), type(params.type)
3609 {
3610     preferableTarget = DNN_TARGET_CPU;
3611 }
3612
3613 void Layer::setParamsFrom(const LayerParams &params)
3614 {
3615     blobs = params.blobs;
3616     name = params.name;
3617     type = params.type;
3618 }
3619
3620 int Layer::inputNameToIndex(String)
3621 {
3622     return -1;
3623 }
3624
3625 int Layer::outputNameToIndex(const String&)
3626 {
3627     return 0;
3628 }
3629
3630 bool Layer::supportBackend(int backendId)
3631 {
3632     return backendId == DNN_BACKEND_OPENCV;
3633 }
3634
3635 Ptr<BackendNode> Layer::initVkCom(const std::vector<Ptr<BackendWrapper> > &)
3636 {
3637     CV_Error(Error::StsNotImplemented, "VkCom pipeline of " + type +
3638                                        " layers is not defined.");
3639     return Ptr<BackendNode>();
3640 }
3641
3642 Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
3643 {
3644     CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
3645                                        " layers is not defined.");
3646     return Ptr<BackendNode>();
3647 }
3648
3649 Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
3650 {
3651     CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
3652                                        " layers is not defined.");
3653     return Ptr<BackendNode>();
3654 }
3655
3656 void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
3657                                  const std::vector<Mat> &outputs, int targetId) const
3658 {
3659 #ifdef  HAVE_HALIDE
3660     CV_TRACE_FUNCTION();
3661
3662     Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
3663                 xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
3664     Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();
3665
3666     int outW, outH, outC, outN;
3667     getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);
3668
3669     if (targetId == DNN_TARGET_CPU)
3670     {
3671         if (outW == 1 && outH == 1)
3672         {
3673             if (outC + outN == 1)
3674                 return;
3675
3676             if (outC > 8)
3677               top.split(c, co, ci, 8)
3678                  .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3679                  .parallel(tile)
3680                  .vectorize(ci, 8);
3681             else
3682               top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
3683                  .parallel(tile);
3684         }
3685         else
3686         {
3687             if (outH > 2)
3688             {
3689                 top.reorder(x, c, y)
3690                    .split(y, yo, yi, 2)
3691                    .fuse(yo, n, tile)
3692                    .parallel(tile)
3693                    .unroll(yi)
3694                    .vectorize(x, outW >= 16 ? 16 : outW);
3695             }
3696         }
3697     }
3698     else if (targetId == DNN_TARGET_OPENCL)
3699     {
3700         if (outW == 1 && outH == 1)
3701         {
3702             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
3703             top.split(c, co, ci, c_split)
3704                .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
3705                .gpu_blocks(tile)
3706                .gpu_threads(ci);
3707         }
3708         else
3709         {
3710             int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
3711             int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
3712             // Supported vectorization widths: 2, 3, 4, 8, 16
3713             int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
3714             top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
3715                .split(c, co, ci, c_split)
3716                .gpu_blocks(xo, yo, co)
3717                .gpu_threads(xi, yi)
3718                .reorder(xi, yi, ci, xo, yo, co)
3719                .vectorize(ci);
3720         }
3721     }
3722     else
3723         CV_Error(Error::StsNotImplemented, "Unknown target identifier");
3724 #endif  // HAVE_HALIDE
3725 }
3726
3727 Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
3728 {
3729     return Ptr<BackendNode>();
3730 }
3731
3732 bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
3733 bool Layer::tryFuse(Ptr<Layer>&) { return false; }
3734 void Layer::getScaleShift(Mat& scale, Mat& shift) const
3735 {
3736     scale = Mat();
3737     shift = Mat();
3738 }
3739
3740 void Layer::unsetAttached()
3741 {
3742     setActivation(Ptr<ActivationLayer>());
3743 }
3744
3745 template <typename T>
3746 static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
3747 {
3748     pv.resize(v.size());
3749     for (size_t i = 0; i < v.size(); i++)
3750         pv[i] = const_cast<T*>(&v[i]);
3751 }
3752
3753 void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
3754 {
3755     CV_TRACE_FUNCTION();
3756     this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
3757 }
3758
3759 void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
3760 {
3761     CV_UNUSED(input); CV_UNUSED(output);
3762 }
3763
3764 void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
3765 {
3766     CV_TRACE_FUNCTION();
3767     std::vector<Mat> inputs, outputs;
3768     inputs_arr.getMatVector(inputs);
3769     outputs_arr.getMatVector(outputs);
3770
3771     std::vector<Mat*> inputsp;
3772     vecToPVec(inputs, inputsp);
3773     this->finalize(inputsp, outputs);
3774 }
3775
3776 std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
3777 {
3778     CV_TRACE_FUNCTION();
3779
3780     std::vector<Mat> outputs;
3781     this->finalize(inputs, outputs);
3782     return outputs;
3783 }
3784
3785 void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
3786 {
3787     // We kept this method for compatibility. DNN calls it now only to support users' implementations.
3788 }
3789
3790 void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3791 {
3792     CV_TRACE_FUNCTION();
3793     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3794
3795     Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
3796 }
3797
3798 void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
3799 {
3800     CV_TRACE_FUNCTION();
3801     CV_TRACE_ARG_VALUE(name, "name", name.c_str());
3802
3803     if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
3804     {
3805         std::vector<UMat> inputs;
3806         std::vector<UMat> outputs;
3807         std::vector<UMat> internals;
3808
3809         std::vector<UMat> orig_inputs;
3810         std::vector<UMat> orig_outputs;
3811         std::vector<UMat> orig_internals;
3812
3813         inputs_arr.getUMatVector(orig_inputs);
3814         outputs_arr.getUMatVector(orig_outputs);
3815         internals_arr.getUMatVector(orig_internals);
3816
3817         inputs.resize(orig_inputs.size());
3818         for (size_t i = 0; i < orig_inputs.size(); i++)
3819             convertFp16(orig_inputs[i], inputs[i]);
3820
3821         outputs.resize(orig_outputs.size());
3822         for (size_t i = 0; i < orig_outputs.size(); i++)
3823             outputs[i].create(shape(orig_outputs[i]), CV_32F);
3824
3825         internals.resize(orig_internals.size());
3826         for (size_t i = 0; i < orig_internals.size(); i++)
3827             internals[i].create(shape(orig_internals[i]), CV_32F);
3828
3829         forward(inputs, outputs, internals);
3830
3831         for (size_t i = 0; i < outputs.size(); i++)
3832             convertFp16(outputs[i], orig_outputs[i]);
3833
3834         // sync results back
3835         outputs_arr.assign(orig_outputs);
3836         internals_arr.assign(orig_internals);
3837         return;
3838     }
3839     std::vector<Mat> inpvec;
3840     std::vector<Mat> outputs;
3841     std::vector<Mat> internals;
3842
3843     inputs_arr.getMatVector(inpvec);
3844     outputs_arr.getMatVector(outputs);
3845     internals_arr.getMatVector(internals);
3846
3847     std::vector<Mat*> inputs(inpvec.size());
3848     for (int i = 0; i < inpvec.size(); i++)
3849         inputs[i] = &inpvec[i];
3850
3851     this->forward(inputs, outputs, internals);
3852
3853     // sync results back
3854     outputs_arr.assign(outputs);
3855     internals_arr.assign(internals);
3856 }
3857
3858 void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
3859 {
3860     CV_TRACE_FUNCTION();
3861
3862     this->finalize(inputs, outputs);
3863     this->forward(inputs, outputs, internals);
3864 }
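// Running a single layer outside a Net (sketch; "ReLU" is just an example of a
// registered type, created through LayerFactory::createLayerInstance() defined below,
// and the output is preallocated with the input's shape since ReLU is element-wise):
//
//     LayerParams lp;
//     lp.type = "ReLU";
//     Ptr<Layer> relu = LayerFactory::createLayerInstance(lp.type, lp);
//     std::vector<Mat> inputs(1, inputBlob);
//     std::vector<Mat> outputs(1, inputBlob.clone());
//     std::vector<Mat> internals;
//     relu->run(inputs, outputs, internals);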
3865
3866 Layer::~Layer() {}
3867
3868 bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
3869                             const int requiredOutputs,
3870                             std::vector<MatShape> &outputs,
3871                             std::vector<MatShape> &internals) const
3872 {
3873     CV_Assert(inputs.size());
3874     outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
3875     return false;
3876 }
3877
3878 //////////////////////////////////////////////////////////////////////////
3879
3880 static Mutex& getLayerFactoryMutex()
3881 {
3882     static Mutex* volatile instance = NULL;
3883     if (instance == NULL)
3884     {
3885         cv::AutoLock lock(getInitializationMutex());
3886         if (instance == NULL)
3887             instance = new Mutex();
3888     }
3889     return *instance;
3890 }
3891
3892 typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;
3893
3894 static LayerFactory_Impl& getLayerFactoryImpl_()
3895 {
3896     static LayerFactory_Impl impl;
3897     return impl;
3898 }
3899
3900 static LayerFactory_Impl& getLayerFactoryImpl()
3901 {
3902     static LayerFactory_Impl* volatile instance = NULL;
3903     if (instance == NULL)
3904     {
3905         cv::AutoLock lock(getLayerFactoryMutex());
3906         if (instance == NULL)
3907         {
3908             instance = &getLayerFactoryImpl_();
3909             initializeLayerFactory();
3910         }
3911     }
3912     return *instance;
3913 }
3914
3915 void LayerFactory::registerLayer(const String &type, Constructor constructor)
3916 {
3917     CV_TRACE_FUNCTION();
3918     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3919
3920     cv::AutoLock lock(getLayerFactoryMutex());
3921     String type_ = toLowerCase(type);
3922     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
3923
3924     if (it != getLayerFactoryImpl().end())
3925     {
3926         if (it->second.back() == constructor)
3927             CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
3928         it->second.push_back(constructor);
3929     }
3930     getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));
3931 }
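// Registering a custom layer type (sketch; MyLayer is a hypothetical cv::dnn::Layer
// subclass and createMyLayer matches the LayerFactory::Constructor signature):
//
//     static Ptr<Layer> createMyLayer(LayerParams& params)
//     {
//         return Ptr<Layer>(new MyLayer(params));
//     }
//     ...
//     LayerFactory::registerLayer("MyType", createMyLayer);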
3932
3933 void LayerFactory::unregisterLayer(const String &type)
3934 {
3935     CV_TRACE_FUNCTION();
3936     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3937
3938     cv::AutoLock lock(getLayerFactoryMutex());
3939     String type_ = toLowerCase(type);
3940
3941     LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
3942     if (it != getLayerFactoryImpl().end())
3943     {
3944         if (it->second.size() > 1)
3945             it->second.pop_back();
3946         else
3947             getLayerFactoryImpl().erase(it);
3948     }
3949 }
3950
3951 Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
3952 {
3953     CV_TRACE_FUNCTION();
3954     CV_TRACE_ARG_VALUE(type, "type", type.c_str());
3955
3956     cv::AutoLock lock(getLayerFactoryMutex());
3957     String type_ = toLowerCase(type);
3958     LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);
3959
3960     if (it != getLayerFactoryImpl().end())
3961     {
3962         CV_Assert(!it->second.empty());
3963         return it->second.back()(params);
3964     }
3965     else
3966     {
3967         return Ptr<Layer>(); //NULL
3968     }
3969 }
3970
3971 BackendNode::BackendNode(int backendId) : backendId(backendId) {}
3972
3973 BackendNode::~BackendNode() {}
3974
3975 BackendWrapper::BackendWrapper(int backendId, int targetId)
3976     : backendId(backendId), targetId(targetId) {}
3977
3978 BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
3979 {
3980     CV_Error(Error::StsNotImplemented,
3981              "Constructor of backend wrapper must be implemented");
3982 }
3983
3984 BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
3985 {
3986     CV_Error(Error::StsNotImplemented,
3987              "Constructor of backend wrapper must be implemented");
3988 }
3989
3990 BackendWrapper::~BackendWrapper() {}
3991
3992 Net readNet(const String& _model, const String& _config, const String& _framework)
3993 {
3994     String framework = toLowerCase(_framework);
3995     String model = _model;
3996     String config = _config;
3997     const std::string modelExt = model.substr(model.rfind('.') + 1);
3998     const std::string configExt = config.substr(config.rfind('.') + 1);
3999     if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
4000                                 modelExt == "prototxt" || configExt == "prototxt")
4001     {
4002         if (modelExt == "prototxt" || configExt == "caffemodel")
4003             std::swap(model, config);
4004         return readNetFromCaffe(config, model);
4005     }
4006     if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
4007                                      modelExt == "pbtxt" || configExt == "pbtxt")
4008     {
4009         if (modelExt == "pbtxt" || configExt == "pb")
4010             std::swap(model, config);
4011         return readNetFromTensorflow(model, config);
4012     }
4013     if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
4014                                 configExt == "t7" || configExt == "net")
4015     {
4016         return readNetFromTorch(model.empty() ? config : model);
4017     }
4018     if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
4019                                   modelExt == "cfg" || configExt == "cfg")
4020     {
4021         if (modelExt == "cfg" || configExt == "weights")
4022             std::swap(model, config);
4023         return readNetFromDarknet(config, model);
4024     }
4025     if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
4026                                modelExt == "xml" || configExt == "xml")
4027     {
4028         if (modelExt == "xml" || configExt == "bin")
4029             std::swap(model, config);
4030         return readNetFromModelOptimizer(config, model);
4031     }
4032     if (framework == "onnx" || modelExt == "onnx")
4033     {
4034         return readNetFromONNX(model);
4035     }
4036     CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
4037                                       model + (config.empty() ? "" : ", " + config));
4038 }
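// Framework auto-detection sketch (file names are placeholders; thanks to the swaps
// above, the model/config argument order is forgiving):
//
//     Net caffeNet = readNet("deploy.prototxt", "weights.caffemodel");
//     Net tfNet    = readNet("frozen_graph.pb");
//     Net onnxNet  = readNet("model.onnx");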
4039
4040 Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
4041             const std::vector<uchar>& bufferConfig)
4042 {
4043     String framework = toLowerCase(_framework);
4044     if (framework == "caffe")
4045         return readNetFromCaffe(bufferConfig, bufferModel);
4046     else if (framework == "tensorflow")
4047         return readNetFromTensorflow(bufferModel, bufferConfig);
4048     else if (framework == "darknet")
4049         return readNetFromDarknet(bufferConfig, bufferModel);
4050     else if (framework == "torch")
4051         CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
4052     else if (framework == "dldt")
4053         CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
4054     CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
4055 }
4056
4057 Net readNetFromModelOptimizer(const String &xml, const String &bin)
4058 {
4059     return Net::readFromModelOptimizer(xml, bin);
4060 }
4061
4062 CV__DNN_INLINE_NS_END
4063 }} // namespace