Fix a bug with OpenVINO backend
[platform/upstream/opencv.git] modules/dnn/src/dnn.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "op_halide.hpp"
#include "op_inf_engine.hpp"
#include "halide_scheduler.hpp"
#include <set>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <iterator>
#include <numeric>
#include <opencv2/dnn/shape_utils.hpp>
#include <opencv2/imgproc.hpp>

#include <opencv2/core/utils/configuration.private.hpp>
#include <opencv2/core/utils/logger.hpp>

namespace cv {
namespace dnn {
CV__DNN_EXPERIMENTAL_NS_BEGIN

// This option is useful for running Valgrind memory error detection.
static bool DNN_DISABLE_MEMORY_OPTIMIZATIONS = utils::getConfigurationParameterBool("OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS", false);

#ifdef HAVE_OPENCL
static bool DNN_OPENCL_ALLOW_ALL_DEVICES = utils::getConfigurationParameterBool("OPENCV_DNN_OPENCL_ALLOW_ALL_DEVICES", false);
#endif

static int PARAM_DNN_BACKEND_DEFAULT = (int)utils::getConfigurationParameterSizeT("OPENCV_DNN_BACKEND_DEFAULT",
#ifdef HAVE_INF_ENGINE
    (size_t)DNN_BACKEND_INFERENCE_ENGINE
#else
    (size_t)DNN_BACKEND_OPENCV
#endif
);

// Additional checks (slow down execution!)
static bool DNN_CHECK_NAN_INF = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF", false);
static bool DNN_CHECK_NAN_INF_DUMP = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_DUMP", false);
static bool DNN_CHECK_NAN_INF_RAISE_ERROR = utils::getConfigurationParameterBool("OPENCV_DNN_CHECK_NAN_INF_RAISE_ERROR", false);

using std::vector;
using std::map;
using std::make_pair;
using std::set;

namespace
{
    typedef std::vector<MatShape> ShapesVec;

    struct LayerShapes
    {
        ShapesVec in, out, internal;
        // There is no guarantee that a layer which supports in-place computation
        // will actually be computed in-place (input.data_ptr == output.data_ptr).
        // If a layer reports that it can work in-place and the layers after it
        // no longer use the input blob, we set output = input.
        bool supportInPlace;
        LayerShapes() {supportInPlace = false;}
    };
}

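// Usage sketch (illustrative, not part of the original file): preprocess one
// image for a hypothetical network that expects a 224x224 BGR input with an
// assumed mean of (104, 117, 123) subtracted:
//   Mat img = imread("image.jpg");
//   Mat blob = blobFromImage(img, 1.0, Size(224, 224), Scalar(104, 117, 123),
//                            /*swapRB=*/true, /*crop=*/false, CV_32F);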
Mat blobFromImage(InputArray image, double scalefactor, const Size& size,
                  const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImage(image, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

void blobFromImage(InputArray image, OutputArray blob, double scalefactor,
                   const Size& size, const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> images(1, image.getMat());
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
}

Mat blobFromImages(InputArrayOfArrays images, double scalefactor, Size size,
                   const Scalar& mean, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    Mat blob;
    blobFromImages(images, blob, scalefactor, size, mean, swapRB, crop, ddepth);
    return blob;
}

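// Preprocessing happens in this order: optional resize (with center crop when
// crop=true), conversion of 8-bit inputs to float, mean subtraction (with the
// mean's R and B components swapped when swapRB=true), multiplication by
// scalefactor, and finally packing into a 4D NCHW blob.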
void blobFromImages(InputArrayOfArrays images_, OutputArray blob_, double scalefactor,
                    Size size, const Scalar& mean_, bool swapRB, bool crop, int ddepth)
{
    CV_TRACE_FUNCTION();
    CV_CheckType(ddepth, ddepth == CV_32F || ddepth == CV_8U, "Blob depth should be CV_32F or CV_8U");
    if (ddepth == CV_8U)
    {
        CV_CheckEQ(scalefactor, 1.0, "Scaling is not supported for CV_8U blob depth");
        CV_Assert(mean_ == Scalar() && "Mean subtraction is not supported for CV_8U blob depth");
    }

    std::vector<Mat> images;
    images_.getMatVector(images);
    CV_Assert(!images.empty());
    for (int i = 0; i < images.size(); i++)
    {
        Size imgSize = images[i].size();
        if (size == Size())
            size = imgSize;
        if (size != imgSize)
        {
            if(crop)
            {
              float resizeFactor = std::max(size.width / (float)imgSize.width,
                                            size.height / (float)imgSize.height);
              resize(images[i], images[i], Size(), resizeFactor, resizeFactor, INTER_LINEAR);
              Rect crop(Point(0.5 * (images[i].cols - size.width),
                              0.5 * (images[i].rows - size.height)),
                        size);
              images[i] = images[i](crop);
            }
            else
              resize(images[i], images[i], size, 0, 0, INTER_LINEAR);
        }
        if(images[i].depth() == CV_8U && ddepth == CV_32F)
            images[i].convertTo(images[i], CV_32F);
        Scalar mean = mean_;
        if (swapRB)
            std::swap(mean[0], mean[2]);

        images[i] -= mean;
        images[i] *= scalefactor;
    }

    size_t i, nimages = images.size();
    Mat image0 = images[0];
    int nch = image0.channels();
    CV_Assert(image0.dims == 2);
    Mat image;
    if (nch == 3 || nch == 4)
    {
        int sz[] = { (int)nimages, nch, image0.rows, image0.cols };
        blob_.create(4, sz, ddepth);
        Mat blob = blob_.getMat();
        Mat ch[4];

        for( i = 0; i < nimages; i++ )
        {
            image = images[i];
            CV_Assert(image.depth() == blob_.depth());
            nch = image.channels();
            CV_Assert(image.dims == 2 && (nch == 3 || nch == 4));
            CV_Assert(image.size() == image0.size());

            for( int j = 0; j < nch; j++ )
                ch[j] = Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, j));
            if(swapRB)
                std::swap(ch[0], ch[2]);
            split(image, ch);
        }
    }
    else
    {
       CV_Assert(nch == 1);
       int sz[] = { (int)nimages, 1, image0.rows, image0.cols };
       blob_.create(4, sz, ddepth);
       Mat blob = blob_.getMat();

       for( i = 0; i < nimages; i++ )
       {
           Mat image = images[i];
           CV_Assert(image.depth() == blob_.depth());
           nch = image.channels();
           CV_Assert(image.dims == 2 && (nch == 1));
           CV_Assert(image.size() == image0.size());

           image.copyTo(Mat(image.rows, image.cols, ddepth, blob.ptr((int)i, 0)));
       }
    }
}

void imagesFromBlob(const cv::Mat& blob_, OutputArrayOfArrays images_)
{
    CV_TRACE_FUNCTION();

    // A blob is a 4-dimensional matrix in floating point precision:
    // blob_[0] = batchSize = number of images
    // blob_[1] = number of channels
    // blob_[2] = height
    // blob_[3] = width
    CV_Assert(blob_.depth() == CV_32F);
    CV_Assert(blob_.dims == 4);

    images_.create(cv::Size(1, blob_.size[0]), blob_.depth());

    std::vector<Mat> vectorOfChannels(blob_.size[1]);
    for (int n = 0; n <  blob_.size[0]; ++n)
    {
        for (int c = 0; c < blob_.size[1]; ++c)
        {
            vectorOfChannels[c] = getPlane(blob_, n, c);
        }
        cv::merge(vectorOfChannels, images_.getMatRef(n));
    }
}

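// Wraps a host cv::Mat with a device-side cv::UMat for the OpenCL target.
// Host-to-device copies are lazy: setHostDirty() only marks the host data as
// modified, and copyToDevice() performs the actual upload on demand.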
class OpenCLBackendWrapper : public BackendWrapper
{
public:
    OpenCLBackendWrapper(Mat& m) : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        m.copyTo(umat);
        host = &m;
        hostDirty = false;
    }

    OpenCLBackendWrapper(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
        : BackendWrapper(DNN_BACKEND_OPENCV, DNN_TARGET_OPENCL)
    {
        Ptr<OpenCLBackendWrapper> base = baseBuffer.dynamicCast<OpenCLBackendWrapper>();
        CV_Assert(!base.empty());

        host = &m;

        int shape[] = {1, (int)base->umat.total()};
        umat = base->umat.reshape(1, 2, &shape[0])
                         .colRange(0, host->total())
                         .reshape(1, host->dims, &host->size[0]);
        hostDirty = false;
    }

    static Ptr<BackendWrapper> create(Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(m));
    }

    static Ptr<BackendWrapper> create(const Ptr<BackendWrapper>& baseBuffer, Mat& m)
    {
        return Ptr<BackendWrapper>(new OpenCLBackendWrapper(baseBuffer, m));
    }

    static std::vector<UMat> getUMatVector(const std::vector<Ptr<BackendWrapper> >& wrappers)
    {
        const int numWrappers = wrappers.size();
        std::vector<UMat> mats(wrappers.size());
        for (int i = 0; i < numWrappers; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->copyToDevice();
            mats[i] = umatWrapper->umat;
        }
        return mats;
    }

    // Replaces all UMats in the wrappers with the given ones.
    static void update(const std::vector<Ptr<BackendWrapper> >& wrappers,
                       const std::vector<UMat>& umats)
    {
        CV_Assert(wrappers.size() == umats.size());
        for (int i = 0, n = umats.size(); i < n; ++i)
        {
            Ptr<OpenCLBackendWrapper> umatWrapper = wrappers[i].dynamicCast<OpenCLBackendWrapper>();
            CV_Assert(!umatWrapper.empty());
            umatWrapper->umat = umats[i];
        }
    }

    ~OpenCLBackendWrapper() {}

    // Copies data from the device to host memory.
    virtual void copyToHost() CV_OVERRIDE
    {
        umat.copyTo(*host);
    }

    virtual void setHostDirty() CV_OVERRIDE
    {
        hostDirty = true;
    }

    void copyToDevice()
    {
        if (hostDirty)
        {
            host->copyTo(umat);
            hostDirty = false;
        }
    }

private:
    UMat umat;
    Mat* host;
    bool hostDirty;
};

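// Identifies a single layer output: lid is the layer id, oid is the index of
// the output blob within that layer.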
struct LayerPin
{
    int lid;
    int oid;

    LayerPin(int layerId = -1, int outputId = -1)
        : lid(layerId), oid(outputId) {}

    bool valid() const
    {
        return (lid >= 0 && oid >= 0);
    }

    bool equal(const LayerPin &r) const
    {
        return (lid == r.lid && oid == r.oid);
    }

    bool operator<(const LayerPin &r) const
    {
        return lid < r.lid || (lid == r.lid && oid < r.oid);
    }

    bool operator ==(const LayerPin &r) const
    {
        return lid == r.lid && oid == r.oid;
    }
};

struct LayerData
{
    LayerData() : id(-1), skip(false), flag(0) {}
    LayerData(int _id, const String &_name, const String &_type, LayerParams &_params)
        : id(_id), name(_name), type(_type), params(_params), skip(false), flag(0)
    {
        CV_TRACE_FUNCTION();

        // Add logging info.
        params.name = name;
        params.type = type;
    }

    int id;
    String name;
    String type;
    LayerParams params;

    std::vector<LayerPin> inputBlobsId;
    std::set<int> inputLayersId;
    std::set<int> requiredOutputs;
    std::vector<LayerPin> consumers;
    std::vector<Ptr<BackendWrapper> > outputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > inputBlobsWrappers;
    std::vector<Ptr<BackendWrapper> > internalBlobsWrappers;

    Ptr<Layer> layerInstance;
    std::vector<Mat> outputBlobs;
    std::vector<Mat*> inputBlobs;
    std::vector<Mat> internals;
    // Computation nodes of implemented backends (except DEFAULT).
    std::map<int, Ptr<BackendNode> > backendNodes;
    // Flag to skip this layer's computation for a specific backend.
    bool skip;

    int flag;

    Ptr<Layer> getLayerInstance()
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(type, "type", type.c_str());

        if (layerInstance)
            return layerInstance;

        layerInstance = LayerFactory::createLayerInstance(type, params);
        if (!layerInstance)
        {
            CV_Error(Error::StsError, "Can't create layer \"" + name + "\" of type \"" + type + "\"");
        }

        return layerInstance;
    }
};

// Fake layer containing the network input blobs.
struct DataLayer : public Layer
{
    DataLayer() : Layer()
    {
        skip = false;
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE && inputsData.size() == 1);
    }

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (outputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> outputs, internals;
        outputs_arr.getMatVector(outputs);
        internals_arr.getMatVector(internals);

        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |      uint8 |        fp32 |
        for (int i = 0; i < inputsData.size(); ++i)
        {
            double scale = scaleFactors[i];
            Scalar& mean = means[i];
            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            CV_CheckTypeEQ(outputs[i].type(), CV_32FC1, "");

            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (singleMean)
            {
                inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
            }
            else
            {
                for (int n = 0; n < inputsData[i].size[0]; ++n)
                    for (int c = 0; c < inputsData[i].size[1]; ++c)
                    {
                        Mat inp = getPlane(inputsData[i], n, c);
                        Mat out = getPlane(outputs[i], n, c);
                        inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                    }
            }
        }
    }

#ifdef HAVE_OPENCL
    std::vector<Mat> tmp_expressions;
    bool forward_ocl(InputArrayOfArrays, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        // Supported modes:
        // | Input type | Output type |
        // |       fp32 |        fp32 |
        // |       fp32 |        fp16 |
        // |      uint8 |        fp32 |
        std::vector<UMat> outputs;
        outputs_.getUMatVector(outputs);

        tmp_expressions.clear();
        for (int i = 0; i < inputsData.size(); ++i)
        {
            Mat inputData = inputsData[i];

            double scale = scaleFactors[i];
            Scalar& mean = means[i];

            CV_Assert(mean == Scalar() || inputsData[i].size[1] <= 4);
            bool singleMean = true;
            for (int j = 1; j < std::min(4, inputsData[i].size[1]) && singleMean; ++j)
            {
                singleMean = mean[j] == mean[j - 1];
            }

            if (outputs_.depth() == CV_16S)
            {
                if (singleMean)
                {
                    tmp_expressions.push_back(Mat(scale * (inputsData[i] - mean[0])));
                    convertFp16(tmp_expressions.back(), outputs[i]);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            tmp_expressions.push_back(scale * (inp - mean[c]));
                            convertFp16(tmp_expressions.back(), out);
                        }
                }
            }
            else
            {
                CV_Assert(outputs_.depth() == CV_32F);
                if (singleMean)
                {
                    inputsData[i].convertTo(outputs[i], CV_32F, scale, -mean[0] * scale);
                }
                else
                {
                    for (int n = 0; n < inputsData[i].size[0]; ++n)
                        for (int c = 0; c < inputsData[i].size[1]; ++c)
                        {
                            Mat inp = getPlane(inputsData[i], n, c);

                            std::vector<cv::Range> plane(4, Range::all());
                            plane[0] = Range(n, n + 1);
                            plane[1] = Range(c, c + 1);
                            UMat out = outputs[i](plane).reshape(1, inp.dims, inp.size);

                            inp.convertTo(out, CV_32F, scale, -mean[c] * scale);
                        }
                }
            }
        }
        return true;
    }
#endif

    int outputNameToIndex(const String& tgtName) CV_OVERRIDE
    {
        int idx = (int)(std::find(outNames.begin(), outNames.end(), tgtName) - outNames.begin());
        return (idx < (int)outNames.size()) ? idx : -1;
    }

    void setNames(const std::vector<String> &names)
    {
        outNames.assign(names.begin(), names.end());
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() == requiredOutputs);
        outputs.assign(inputs.begin(), inputs.end());
        return false;
    }

    virtual void finalize(InputArrayOfArrays, OutputArrayOfArrays outputs_arr) CV_OVERRIDE
    {
        std::vector<Mat> outputs;
        outputs_arr.getMatVector(outputs);

        CV_Assert_N(outputs.size() == scaleFactors.size(), outputs.size() == means.size(),
                  inputsData.size() == outputs.size());
        skip = true;
        for (int i = 0; skip && i < inputsData.size(); ++i)
        {
            if (inputsData[i].data != outputs[i].data || scaleFactors[i] != 1.0 || means[i] != Scalar())
                skip = false;
        }
    }

    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >&) CV_OVERRIDE
    {
#ifdef HAVE_INF_ENGINE
        InferenceEngine::LayerParams lp;
        lp.name = name;
        lp.type = "ScaleShift";
        lp.precision = InferenceEngine::Precision::FP32;
        std::shared_ptr<InferenceEngine::ScaleShiftLayer> ieLayer(new InferenceEngine::ScaleShiftLayer(lp));

        CV_CheckEQ(inputsData.size(), (size_t)1, "");
        CV_CheckEQ(inputsData[0].dims, 4, "");
        const size_t numChannels = inputsData[0].size[1];
        CV_Assert(numChannels <= 4);

        // Scale
        auto weights = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
                                                                {numChannels});
        weights->allocate();
        weights->set(std::vector<float>(numChannels, scaleFactors[0]));
        ieLayer->_weights = weights;

        // Mean subtraction
        auto biases = InferenceEngine::make_shared_blob<float>(InferenceEngine::Precision::FP32,
                                                               {numChannels});
        biases->allocate();
        std::vector<float> biasesVec(numChannels);
        for (int i = 0; i < numChannels; ++i)
        {
            biasesVec[i] = -means[0][i] * scaleFactors[0];
        }
        biases->set(biasesVec);
        ieLayer->_biases = biases;

        return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
#endif  // HAVE_INF_ENGINE
        return Ptr<BackendNode>();
    }

    std::vector<String> outNames;
    // Preprocessing parameters for each network input.
    std::vector<double> scaleFactors;
    std::vector<Scalar> means;
    std::vector<Mat> inputsData;
    bool skip;
};

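// Tracks ownership and reuse of layer output memory via reference counting.
// Blobs whose reference count has dropped to zero may be handed out again as
// storage for later layers, which reduces the total allocation size (this can
// be disabled with OPENCV_DNN_DISABLE_MEMORY_OPTIMIZATIONS, e.g. for Valgrind).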
struct BlobManager
{
public:
    // Increase the reference counter of a layer output.
    void addReference(const LayerPin& lp)
    {
        std::map<LayerPin, int>::iterator it = refCounter.find(lp);
        if (it == refCounter.end())
            refCounter[lp] = 1;
        else
            it->second += 1;
    }

    void addReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            addReference(pins[i]);
        }
    }

    // Returns the number of references to the allocated memory that is used
    // by the specific layer blob.
    int numReferences(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());
        LayerPin memHost = mapIt->second;

        std::map<LayerPin, int>::iterator refIt = refCounter.find(memHost);
        CV_Assert(refIt != refCounter.end());
        return refIt->second;
    }

    // Reuse the data allocated for <host> inside the <user> blob.
    void reuse(const LayerPin& host, const LayerPin& user)
    {
        CV_Assert(reuseMap.find(user) == reuseMap.end());
        CV_Assert(reuseMap.find(host) != reuseMap.end());
        LayerPin memHost = reuseMap[host];
        reuseMap[user] = memHost;
        if (refCounter.find(memHost) != refCounter.end())
        {
            std::map<LayerPin, int>::iterator userRefIt = refCounter.find(user);
            if (userRefIt != refCounter.end())
            {
                refCounter[memHost] += userRefIt->second;
                refCounter.erase(userRefIt);
            }
            else
                refCounter[memHost] += 1;
        }
    }

    // Decrease the reference counter of the memory allocated for the specific blob.
    void releaseReference(const LayerPin& lp)
    {
        std::map<LayerPin, LayerPin>::iterator mapIt = reuseMap.find(lp);
        CV_Assert(mapIt != reuseMap.end());

        std::map<LayerPin, int>::iterator refIt = refCounter.find(mapIt->second);
        CV_Assert(refIt != refCounter.end());
        CV_Assert(refIt->second > 0);
        refIt->second -= 1;
    }

    void releaseReferences(const std::vector<LayerPin>& pins)
    {
        for (int i = 0; i < pins.size(); i++)
        {
            releaseReference(pins[i]);
        }
    }

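    // Best-fit reuse: among blobs with zero remaining references, pick the
    // smallest one that can still hold total(shape) elements; only if none
    // fits is new memory allocated.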
    void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool use_half)
    {
        if (!DNN_DISABLE_MEMORY_OPTIMIZATIONS)
        {
            Mat bestBlob;
            LayerPin bestBlobPin;

            std::map<LayerPin, Mat>::iterator hostIt;
            std::map<LayerPin, int>::iterator refIt;

            const int targetTotal = total(shape);
            int bestBlobTotal = INT_MAX;

            for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
            {
                refIt = refCounter.find(hostIt->first);
                // Use only blobs that had references before; otherwise the
                // blob might be used as a network output.
                if (refIt != refCounter.end() && refIt->second == 0)
                {
                    Mat& unusedBlob = hostIt->second;
                    if (unusedBlob.total() >= targetTotal &&
                        unusedBlob.total() < bestBlobTotal)
                    {
                        bestBlobPin = hostIt->first;
                        bestBlob = unusedBlob;
                        bestBlobTotal = unusedBlob.total();
                    }
                }
            }
            if (!bestBlob.empty())
            {
                reuse(bestBlobPin, lp);
                dst = bestBlob.reshape(1, 1).colRange(0, targetTotal).reshape(1, shape);
                return;
            }
        }

        {
            // If dst has already been allocated with total(shape) elements,
            // it won't be recreated and the dst.data pointer remains the same.
            dst.create(shape, use_half ? CV_16S : CV_32F);
            addHost(lp, dst);
        }
    }

    void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
                               std::vector<LayerPin>& pinsForInternalBlobs,
                               bool use_half = false)
    {
        CV_TRACE_FUNCTION();

        pinsForInternalBlobs.clear();

        std::vector<Mat>& outputBlobs = ld.outputBlobs,
                &internalBlobs = ld.internals;

        const ShapesVec& outShapes = layerShapes.out,
                internalShapes = layerShapes.internal;

        outputBlobs.resize(std::max((size_t)1, outShapes.size())); // a layer produces at least one output blob
        internalBlobs.resize(internalShapes.size());

        CV_Assert(ld.requiredOutputs.size() <= outShapes.size());

        // Check whether the layer can work in-place.
        bool inPlace = false;
        if (layerShapes.supportInPlace)
        {
            if (ld.inputBlobs.size() == 1)
            {
                // Get the number of references to the input memory.
                int numRef = numReferences(ld.inputBlobsId[0]);
                // In-place is possible only if the current layer is the one
                // and only consumer of this blob.
                inPlace = numRef == 1;
            }
        }

        ShapesVec shapes(outShapes);
        shapes.insert(shapes.end(), internalShapes.begin(), internalShapes.end());
        std::vector<Mat*> blobs;
        for(int i = 0; i < outputBlobs.size(); i++)
        {
            blobs.push_back(&outputBlobs[i]);
        }

        for(int i = 0; i < internalBlobs.size(); i++)
        {
            blobs.push_back(&internalBlobs[i]);
            if (total(internalShapes[i]))
            {
                pinsForInternalBlobs.push_back(LayerPin(ld.id, ld.outputBlobs.size() + i));
            }
        }

        addReferences(pinsForInternalBlobs);

        std::map<int, std::vector<int> > idxSizes;
        for(int i = 0; i < shapes.size(); i++)
        {
            idxSizes[total(shapes[i])].push_back(i);
        }

        std::map<int, std::vector<int> >::reverse_iterator it;
        for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
        {
            for(int j = 0; j < it->second.size(); j++)
            {
                int index = it->second[j];
                if (total(shapes[index]))
                {
                    LayerPin blobPin(ld.id, index);
                    if (index < outShapes.size() && inPlace)
                    {
                        CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
                        ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
                        reuse(ld.inputBlobsId[0], blobPin);
                    }
                    else
                        reuseOrCreate(shapes[index], blobPin, *blobs[index], use_half);
                }
            }
        }
    }

    // Clear the internal state. Called before every reallocation.
    void reset()
    {
        CV_TRACE_FUNCTION();

        refCounter.clear();
        reuseMap.clear();
        memHosts.clear();
    }

private:
    // Register allocated memory.
    void addHost(const LayerPin& lp, const Mat& mat)
    {
        CV_Assert(memHosts.find(lp) == memHosts.end());
        reuseMap[lp] = lp;
        memHosts[lp] = mat;
    }

    std::map<LayerPin, int> refCounter;
    // Maps a pin to its origin blob (the one for which the memory was
    // originally allocated). For origin blobs, key == value.
    std::map<LayerPin, LayerPin> reuseMap;
    std::map<LayerPin, Mat> memHosts;
};

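// Creates a backend-specific wrapper around a host Mat; returns an empty
// pointer for the plain OpenCV/CPU path, where no wrapping is needed.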
static Ptr<BackendWrapper> wrapMat(int backendId, int targetId, cv::Mat& m)
{
    if (backendId == DNN_BACKEND_OPENCV)
    {
        if (targetId == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();
        else if (IS_DNN_OPENCL_TARGET(targetId))
            return OpenCLBackendWrapper::create(m);
        else
            CV_Error(Error::StsNotImplemented, "Unknown target identifier");
    }
    else if (backendId == DNN_BACKEND_HALIDE)
    {
        CV_Assert(haveHalide());
#ifdef HAVE_HALIDE
        return Ptr<BackendWrapper>(new HalideBackendWrapper(targetId, m));
#endif  // HAVE_HALIDE
    }
    else if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
    {
        CV_Assert(haveInfEngine());
#ifdef HAVE_INF_ENGINE
        return Ptr<BackendWrapper>(new InfEngineBackendWrapper(targetId, m));
#endif  // HAVE_INF_ENGINE
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    return Ptr<BackendWrapper>();
}

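// Internal implementation of cv::dnn::Net: owns the layer graph, the blob
// memory manager, and the per-backend initialization logic.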
struct Net::Impl
{
    typedef std::map<int, LayerShapes> LayersShapesMap;
    typedef std::map<int, LayerData> MapIdToLayerData;

    Impl()
    {
        // Allocate the fake network input layer.
        netInputLayer = Ptr<DataLayer>(new DataLayer());
        LayerData &inpl = layers.insert( make_pair(0, LayerData()) ).first->second;
        inpl.id = 0;
        netInputLayer->name = inpl.name = "_input";
        inpl.type = "__NetInputLayer__";
        inpl.layerInstance = netInputLayer;
        layerNameToId.insert(std::make_pair(inpl.name, inpl.id));

        lastLayerId = 0;
        netWasAllocated = false;
        fusion = true;
        preferableBackend = DNN_BACKEND_DEFAULT;
        preferableTarget = DNN_TARGET_CPU;
        skipInfEngineInit = false;
    }

    Ptr<DataLayer> netInputLayer;
    std::vector<LayerPin> blobsToKeep;
    MapIdToLayerData layers;
    std::map<String, int> layerNameToId;
    BlobManager blobManager;
    int preferableBackend;
    int preferableTarget;
    String halideConfigFile;
    bool skipInfEngineInit;
    // Maps host data to a backend-specific wrapper.
    std::map<void*, Ptr<BackendWrapper> > backendWrappers;

    int lastLayerId;

    bool netWasAllocated;
    bool fusion;
    std::vector<int64> layersTimings;
    Mat output_blob;

    Ptr<BackendWrapper> wrap(Mat& host)
    {
        if (preferableBackend == DNN_BACKEND_OPENCV && preferableTarget == DNN_TARGET_CPU)
            return Ptr<BackendWrapper>();

        MatShape shape(host.dims);
        for (int i = 0; i < host.dims; ++i)
            shape[i] = host.size[i];

        void* data = host.data;
        if (backendWrappers.find(data) != backendWrappers.end())
        {
            Ptr<BackendWrapper> baseBuffer = backendWrappers[data];
            if (preferableBackend == DNN_BACKEND_OPENCV)
            {
                CV_Assert(IS_DNN_OPENCL_TARGET(preferableTarget));
                return OpenCLBackendWrapper::create(baseBuffer, host);
            }
            else if (preferableBackend == DNN_BACKEND_HALIDE)
            {
                CV_Assert(haveHalide());
  #ifdef HAVE_HALIDE
                return Ptr<BackendWrapper>(new HalideBackendWrapper(baseBuffer, shape));
  #endif  // HAVE_HALIDE
            }
            else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            {
                return wrapMat(preferableBackend, preferableTarget, host);
            }
            else
                CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
        }

        Ptr<BackendWrapper> wrapper = wrapMat(preferableBackend, preferableTarget, host);
        backendWrappers[data] = wrapper;
        return wrapper;
    }

#ifdef HAVE_HALIDE
    void compileHalide()
    {
        CV_TRACE_FUNCTION();

        CV_Assert(preferableBackend == DNN_BACKEND_HALIDE);

        HalideScheduler scheduler(halideConfigFile);
        std::vector< std::reference_wrapper<LayerData> > compileList; compileList.reserve(64);
        for (MapIdToLayerData::iterator it = layers.begin(); it != layers.end(); ++it)
        {
            LayerData &ld = it->second;
            Ptr<Layer> layer = ld.layerInstance;
            if (layer->supportBackend(DNN_BACKEND_HALIDE) && !ld.skip)
            {
                CV_Assert(!ld.backendNodes[DNN_BACKEND_HALIDE].empty());
                bool scheduled = scheduler.process(ld.backendNodes[DNN_BACKEND_HALIDE]);
                if (!scheduled)
                {
                    // Use the automatic scheduling provided by the layer.
                    layer->applyHalideScheduler(ld.backendNodes[DNN_BACKEND_HALIDE],
                                                ld.inputBlobs, ld.outputBlobs,
                                                preferableTarget);
                }
                compileList.emplace_back(ld);
            }
        }
        std::atomic<int> progress(0);
        auto fn = ([&] () -> void
        {
            for (;;)
            {
                int id = progress.fetch_add(1);
                if ((size_t)id >= compileList.size())
                    return;
                const LayerData& ld = compileList[id].get();
                Ptr<BackendNode> node = ld.backendNodes.find(DNN_BACKEND_HALIDE)->second;
                dnn::compileHalide(ld.outputBlobs, node, preferableTarget);
            }
        });
        size_t num_threads = std::min(compileList.size(), (size_t)std::thread::hardware_concurrency());
        num_threads = std::max((size_t)1u, std::min((size_t)8u, num_threads));
        std::vector<std::thread> threads(num_threads - 1);
        for (auto& t: threads) t = std::thread(fn);
        fn(); // process own tasks
        for (auto& t: threads) t.join();
    }
#endif

    void clear()
    {
        CV_TRACE_FUNCTION();

        MapIdToLayerData::iterator it;
        for (it = layers.begin(); it != layers.end(); it++)
        {
            if (it->second.id != 0) {
                it->second.inputBlobs.clear();
                it->second.outputBlobs.clear();
                it->second.internals.clear();
            }
            it->second.skip = false;
            //it->second.consumers.clear();
            Ptr<Layer> currLayer = it->second.layerInstance;

            if( currLayer.empty() )
                continue;

            currLayer->unsetAttached();

            Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
            if( !poolingLayer.empty() )
            {
                poolingLayer->computeMaxIdx = true;
            }
        }

        layersTimings.clear();
    }

    void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
    {
        CV_TRACE_FUNCTION();

        if (preferableBackend == DNN_BACKEND_DEFAULT)
            preferableBackend = (Backend)PARAM_DNN_BACKEND_DEFAULT;

        CV_Assert(preferableBackend != DNN_BACKEND_OPENCV ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16);
        CV_Assert(preferableBackend != DNN_BACKEND_HALIDE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL);
        CV_Assert(preferableBackend != DNN_BACKEND_INFERENCE_ENGINE ||
                  preferableTarget == DNN_TARGET_CPU ||
                  preferableTarget == DNN_TARGET_OPENCL ||
                  preferableTarget == DNN_TARGET_OPENCL_FP16 ||
                  preferableTarget == DNN_TARGET_MYRIAD);
        if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
        {
            if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
#ifndef HAVE_OPENCL
            {
                CV_LOG_WARNING(NULL, "DNN: OpenCL target is not available in this OpenCV build, switching to CPU.");
                preferableTarget = DNN_TARGET_CPU;
            }
#else
            {
                if (!DNN_OPENCL_ALLOW_ALL_DEVICES)
                {
                    // The current implementation is only valid for GPUs (#11494).
                    if (ocl::Device::getDefault().type() != ocl::Device::TYPE_GPU)
                    {
                        CV_LOG_WARNING(NULL, "DNN: OpenCL target is not supported with current OpenCL device (tested with GPUs only), switching to CPU.");
                        preferableTarget = DNN_TARGET_CPU;
                    }
                    else if (preferableTarget == DNN_TARGET_OPENCL_FP16 && !ocl::Device::getDefault().isIntel())
                    {
                        CV_LOG_WARNING(NULL,
                            "DNN: OpenCL target with fp16 precision is not supported "
                            "with current OpenCL device (tested with Intel GPUs only), "
                            "switching to OpenCL with fp32 precision.");
                        preferableTarget = DNN_TARGET_OPENCL;
                    }
                }
            }
#endif
            clear();

            allocateLayers(blobsToKeep_);

            MapIdToLayerData::iterator it = layers.find(0);
            CV_Assert(it != layers.end());
            it->second.skip = netInputLayer->skip;

            initBackend();

            if (!netWasAllocated )
            {
#ifdef HAVE_HALIDE
                if (preferableBackend == DNN_BACKEND_HALIDE)
                    compileHalide();
#else
                CV_Assert(preferableBackend != DNN_BACKEND_HALIDE);
#endif
            }

            netWasAllocated = true;
            this->blobsToKeep = blobsToKeep_;
        }
    }

    int getLayerId(const String &layerName)
    {
        std::map<String, int>::iterator it = layerNameToId.find(layerName);
        return (it != layerNameToId.end()) ? it->second : -1;
    }

    int getLayerId(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? id : -1;
    }

    int getLayerId(DictValue &layerDesc)
    {
        if (layerDesc.isInt())
            return getLayerId(layerDesc.get<int>());
        else if (layerDesc.isString())
            return getLayerId(layerDesc.get<String>());

        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        return -1;
    }

    String getLayerName(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);
        return (it != layers.end()) ? it->second.name : "(unknown layer)";
    }

    LayerData& getLayerData(int id)
    {
        MapIdToLayerData::iterator it = layers.find(id);

        if (it == layers.end())
            CV_Error(Error::StsObjectNotFound, format("Layer with requested id=%d not found", id));

        return it->second;
    }

    LayerData& getLayerData(const String &layerName)
    {
        int id = getLayerId(layerName);

        if (id < 0)
            CV_Error(Error::StsError, "Requested layer \"" + layerName + "\" not found");

        return getLayerData(id);
    }

    LayerData& getLayerData(const DictValue &layerDesc)
    {
        CV_Assert(layerDesc.isInt() || layerDesc.isString());
        if (layerDesc.isInt())
            return getLayerData(layerDesc.get<int>());
        else /*if (layerDesc.isString())*/
            return getLayerData(layerDesc.get<String>());
    }

    static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
    {
        if ((int)ld.inputBlobsId.size() <= inNum)
        {
            ld.inputBlobsId.resize(inNum + 1);
        }
        else
        {
            LayerPin storedFrom = ld.inputBlobsId[inNum];
            if (storedFrom.valid() && !storedFrom.equal(from))
                CV_Error(Error::StsError, format("Input #%d of layer \"%s\" already was connected",
                                                 inNum, ld.name.c_str()));
        }

        ld.inputBlobsId[inNum] = from;
    }

    int resolvePinOutputName(LayerData &ld, const String &outName)
    {
        if (outName.empty())
            return 0;
        return ld.getLayerInstance()->outputNameToIndex(outName);
    }

    LayerPin getPinByAlias(const String &layerName)
    {
        LayerPin pin;
        pin.lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        if (pin.lid >= 0)
            pin.oid = resolvePinOutputName(getLayerData(pin.lid), layerName);

        return pin;
    }

    std::vector<LayerPin> getLayerOutPins(const String &layerName)
    {
        int lid = (layerName.empty()) ? 0 : getLayerId(layerName);

        std::vector<LayerPin> pins;

        for (int i = 0; i < layers[lid].outputBlobs.size(); i++)
        {
            pins.push_back(LayerPin(lid, i));
        }

        return pins;
    }

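    // Connects output #outNum of layer <outLayerId> to input #inNum of layer
    // <inLayerId>. Layer ids must be topologically ordered (producer < consumer).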
    void connect(int outLayerId, int outNum, int inLayerId, int inNum)
    {
        CV_Assert(outLayerId < inLayerId);
        LayerData &ldOut = getLayerData(outLayerId);
        LayerData &ldInp = getLayerData(inLayerId);

        addLayerInput(ldInp, inNum, LayerPin(outLayerId, outNum));
        ldOut.requiredOutputs.insert(outNum);
        ldOut.consumers.push_back(LayerPin(inLayerId, outNum));
    }

    void initBackend()
    {
        CV_TRACE_FUNCTION();
        if (preferableBackend == DNN_BACKEND_OPENCV)
            CV_Assert(preferableTarget == DNN_TARGET_CPU || IS_DNN_OPENCL_TARGET(preferableTarget));
        else if (preferableBackend == DNN_BACKEND_HALIDE)
            initHalideBackend();
        else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
            initInfEngineBackend();
        else
            CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
    }

    void initHalideBackend()
    {
        CV_TRACE_FUNCTION();
        CV_Assert_N(preferableBackend == DNN_BACKEND_HALIDE, haveHalide());

        // Iterator to the current layer.
        MapIdToLayerData::iterator it = layers.begin();
        // Iterator to the base layer for fusion. For example, in the case of
        // conv+bn+relu it will be the conv layer.
        MapIdToLayerData::iterator baseIt = layers.begin();
        for (; it != layers.end(); it++)
        {
            LayerData &ldTop = it->second;
            Ptr<Layer> layerTop = ldTop.layerInstance;
            if (!layerTop->supportBackend(preferableBackend))
            {
                // Move the base iterator to the layer that doesn't support the
                // preferable backend to prevent fusion across layers of
                // different backends.
                baseIt = it;
                continue;
            }
            // Try to fuse layers.
            LayerData &ldBot = baseIt->second;
            Ptr<Layer> layerBot = ldBot.layerInstance;
            // 1. Check that the bottom and top layers are from the same backend.
            if (it != layers.begin() && layerBot->supportBackend(preferableBackend))
            {
                // 2. Check that the current layer works in-place.
                bool inPlace = ldTop.inputBlobs.size() == 1 &&
                               ldBot.outputBlobs.size() == 1 &&
                               ldTop.inputBlobs[0]->data ==
                               ldBot.outputBlobs[0].data;
                if (inPlace)
                {
                    // 3. Try to attach the node.
                    CV_Assert(!ldBot.backendNodes[preferableBackend].empty());
                    Ptr<BackendNode> fusedNode =
                        layerTop->tryAttach(ldBot.backendNodes[preferableBackend]);
                    if (!fusedNode.empty())
                    {
                        ldTop.skip = true;
                        ldBot.backendNodes[preferableBackend] = fusedNode;
                        ldBot.outputBlobsWrappers = ldTop.outputBlobsWrappers;
                        continue;
                    }
                }
            }
            // No layer fusion.
            ldTop.skip = false;
            ldTop.backendNodes[DNN_BACKEND_HALIDE] =
                layerTop->initHalide(ldTop.inputBlobsWrappers);
            baseIt = it;
        }
    }

#ifdef HAVE_INF_ENGINE
    // Before launching an Inference Engine graph we need to specify its output
    // blobs. This function requests output blobs based on input references of
    // layers from the default backend or from different graphs.
    void addInfEngineNetOutputs(LayerData &ld)
    {
        Ptr<InfEngineBackendNet> layerNet;
        if (ld.backendNodes.find(preferableBackend) != ld.backendNodes.end())
        {
            Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
            if (!node.empty())
            {
                Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
                CV_Assert(!ieNode.empty()); CV_Assert(!ieNode->net.empty());
                layerNet = ieNode->net;
            }
        }
        // For every input reference we check whether it belongs to one of the
        // Inference Engine backend graphs; if so, request an output blob.
        // Do nothing if the layer's input is from the same graph.
        for (int i = 0; i < ld.inputBlobsId.size(); ++i)
        {
            LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
            Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
            if (!inpNode.empty())
            {
                Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
                CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
                if (layerNet != ieInpNode->net)
                {
                    // layerNet is empty or the nodes are from different graphs.
                    ieInpNode->net->addOutput(ieInpNode->layer->name);
                }
            }
        }
    }
#endif  // HAVE_INF_ENGINE

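    // Builds the Inference Engine (OpenVINO) representation of the network:
    // assigns names to the data nodes, then either registers blobs for an
    // externally loaded IR graph (skipInfEngineInit) or assembles backend
    // networks layer by layer, splitting the model at unsupported layers.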
1359     void initInfEngineBackend()
1360     {
1361         CV_TRACE_FUNCTION();
1362         CV_Assert_N(preferableBackend == DNN_BACKEND_INFERENCE_ENGINE, haveInfEngine());
1363 #ifdef HAVE_INF_ENGINE
1364         MapIdToLayerData::iterator it;
1365         Ptr<InfEngineBackendNet> net;
1366
1367         for (it = layers.begin(); it != layers.end(); ++it)
1368         {
1369             LayerData &ld = it->second;
1370             if (ld.id == 0)
1371             {
1372                 CV_Assert((netInputLayer->outNames.empty() && ld.outputBlobsWrappers.size() == 1) ||
1373                           (netInputLayer->outNames.size() == ld.outputBlobsWrappers.size()));
1374                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1375                 {
1376                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1377                     dataPtr->name = netInputLayer->outNames.empty() ? ld.name : netInputLayer->outNames[i];
1378                 }
1379             }
1380             else
1381             {
1382                 for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1383                 {
1384                     InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1385                     dataPtr->name = ld.name;
1386                 }
1387             }
1388         }
1389
1390         if (skipInfEngineInit)
1391         {
1392             Ptr<BackendNode> node = layers[lastLayerId].backendNodes[preferableBackend];
1393             CV_Assert(!node.empty());
1394
1395             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1396             CV_Assert(!ieNode.empty());
1397
1398             for (it = layers.begin(); it != layers.end(); ++it)
1399             {
1400                 LayerData &ld = it->second;
1401                 if (ld.id == 0)
1402                 {
1403                     for (int i = 0; i < ld.inputBlobsWrappers.size(); ++i)
1404                     {
1405                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.inputBlobsWrappers[i]);
1406                         dataPtr->name = netInputLayer->outNames[i];
1407                     }
1408                 }
1409                 else
1410                 {
1411                     for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
1412                     {
1413                         InferenceEngine::DataPtr dataPtr = infEngineDataNode(ld.outputBlobsWrappers[i]);
1414                         dataPtr->name = ld.name;
1415                     }
1416                 }
1417                 ieNode->net->addBlobs(ld.inputBlobsWrappers);
1418                 ieNode->net->addBlobs(ld.outputBlobsWrappers);
1419                 ld.skip = true;
1420             }
1421             layers[lastLayerId].skip = false;
1422             ieNode->net->init(preferableTarget);
1423             return;
1424         }
1425
1426         // Build Inference Engine networks from sets of layers that support this
1427         // backend. Split a whole model on several Inference Engine networks if
1428         // some of layers is not implemented.
1429
1430         // Set of all input and output blobs wrappers for current network.
1431         std::map<LayerPin, Ptr<BackendWrapper> > netBlobsWrappers;
1432         for (it = layers.begin(); it != layers.end(); ++it)
1433         {
1434             LayerData &ld = it->second;
1435             if (ld.id == 0 && ld.skip)
1436                 continue;
1437             bool fused = ld.skip;
1438
1439             Ptr<Layer> layer = ld.layerInstance;
1440             if (!fused && !layer->supportBackend(preferableBackend))
1441             {
1442                 addInfEngineNetOutputs(ld);
1443                 net = Ptr<InfEngineBackendNet>();
1444                 netBlobsWrappers.clear();
1445                 layer->preferableTarget = DNN_TARGET_CPU;
1446                 continue;
1447             }
1448             ld.skip = true;  // Initially skip all Inference Engine supported layers.
1449
1450             // Create a new network if one of inputs from different Inference Engine graph.
1451             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1452             {
1453                 LayerData &inpLd = layers[ld.inputBlobsId[i].lid];
1454                 Ptr<BackendNode> inpNode = inpLd.backendNodes[preferableBackend];
1455                 if (!inpNode.empty())
1456                 {
1457                     Ptr<InfEngineBackendNode> ieInpNode = inpNode.dynamicCast<InfEngineBackendNode>();
1458                     CV_Assert(!ieInpNode.empty()); CV_Assert(!ieInpNode->net.empty());
1459                     if (ieInpNode->net != net)
1460                     {
1461                         net = Ptr<InfEngineBackendNet>();
1462                         netBlobsWrappers.clear();
1463                         break;
1464                     }
1465                 }
1466             }
1467
1468             // The same blob wrappers cannot be shared between two Inference Engine
1469             // networks because of explicit references between layers and blobs.
1470             // So we need to rewrap all the external blobs.
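            // (E.g., hypothetically: if layer A's output already feeds network #1
            // and a new network #2 starts here, network #2 gets a fresh
            // InfEngineBackendWrapper copy of that blob instead of the wrapper
            // registered with network #1.)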
1471             for (int i = 0; i < ld.inputBlobsId.size(); ++i)
1472             {
1473                 LayerPin inPin = ld.inputBlobsId[i];
1474                 auto it = netBlobsWrappers.find(inPin);
1475                 if (it == netBlobsWrappers.end())
1476                 {
1477                     ld.inputBlobsWrappers[i] = InfEngineBackendWrapper::create(ld.inputBlobsWrappers[i]);
1478                     netBlobsWrappers[inPin] = ld.inputBlobsWrappers[i];
1479                 }
1480                 else
1481                     ld.inputBlobsWrappers[i] = it->second;
1482             }
1483             netBlobsWrappers[LayerPin(ld.id, 0)] = ld.outputBlobsWrappers[0];
1484
1485             Ptr<BackendNode> node;
1486             if (!net.empty())
1487             {
1488                 if (fused)
1489                 {
1490                     bool inPlace = ld.inputBlobsId.size() == 1 && ld.outputBlobs.size() == 1 &&
1491                                    ld.inputBlobs[0]->data == ld.outputBlobs[0].data;
1492                     CV_Assert(inPlace);
1493                     node = layers[ld.inputBlobsId[0].lid].backendNodes[preferableBackend];
1494                     ld.inputBlobsWrappers = layers[ld.inputBlobsId[0].lid].inputBlobsWrappers;
1495                 }
1496             }
1497             else
1498                 net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet());
1499
1500             if (!fused)
1501             {
1502                 node = layer->initInfEngine(ld.inputBlobsWrappers);
1503             }
1504             else if (node.empty())
1505                 continue;
1506
1507             CV_Assert(!node.empty());
1508             ld.backendNodes[preferableBackend] = node;
1509
1510             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1511             CV_Assert(!ieNode.empty());
1512             ieNode->net = net;
1513
1514             auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
1515             if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD) && !fused)
1516             {
1517                 ieNode->layer->precision = InferenceEngine::Precision::FP16;
1518                 if (weightableLayer)
1519                 {
1520                     if (weightableLayer->_weights)
1521                         weightableLayer->_weights = convertFp16(weightableLayer->_weights);
1522                     if (weightableLayer->_biases)
1523                         weightableLayer->_biases = convertFp16(weightableLayer->_biases);
1524                 }
1525                 else
1526                 {
1527                     for (const auto& weights : {"weights", "biases"})
1528                     {
1529                         auto it = ieNode->layer->blobs.find(weights);
1530                         if (it != ieNode->layer->blobs.end())
1531                             it->second = convertFp16(it->second);
1532                     }
1533                 }
1534             }
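            // Mirror _weights/_biases into the generic blobs map so that code
            // reading the map (e.g. when the Inference Engine network is built)
            // picks up the possibly FP16-converted data.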
1535             if (weightableLayer)
1536             {
1537                 if (weightableLayer->_weights)
1538                     weightableLayer->blobs["weights"] = weightableLayer->_weights;
1539                 if (weightableLayer->_biases)
1540                     weightableLayer->blobs["biases"] = weightableLayer->_biases;
1541             }
1542             ieNode->connect(ld.inputBlobsWrappers, ld.outputBlobsWrappers);
1543             net->addBlobs(ld.inputBlobsWrappers);
1544             net->addBlobs(ld.outputBlobsWrappers);
1545
1546             if (!fused)
1547                 net->addLayer(ieNode->layer);
1548             addInfEngineNetOutputs(ld);
1549         }
1550
1551         // Initialize all networks.
1552         std::set<InfEngineBackendNet> initializedNets;
1553         for (MapIdToLayerData::reverse_iterator it = layers.rbegin(); it != layers.rend(); ++it)
1554         {
1555             LayerData &ld = it->second;
1556             if (ld.backendNodes.find(preferableBackend) == ld.backendNodes.end())
1557                 continue;
1558
1559             Ptr<BackendNode> node = ld.backendNodes[preferableBackend];
1560             if (node.empty())
1561                 continue;
1562
1563             Ptr<InfEngineBackendNode> ieNode = node.dynamicCast<InfEngineBackendNode>();
1564             if (ieNode.empty())
1565                 continue;
1566
1567             CV_Assert(!ieNode->net.empty());
1568
1569             if (!ieNode->net->isInitialized())
1570             {
1571                 ieNode->net->init(preferableTarget);
1572                 ld.skip = false;
1573             }
1574         }
1575 #endif  // HAVE_INF_ENGINE
1576     }
1577
1578     void allocateLayer(int lid, const LayersShapesMap& layersShapes)
1579     {
1580         CV_TRACE_FUNCTION();
1581
1582         LayerData &ld = layers[lid];
1583
1584         //already allocated
1585         if (ld.flag)
1586             return;
1587
1588         size_t ninputs = ld.inputBlobsId.size();
1589 #if 0
1590         printf("layer %s:", ld.name.c_str());
1591         for (size_t i = 0; i < ninputs; i++)
1592         {
1593             int inp_lid = ld.inputBlobsId[i].lid;
1594             LayerData &inp_ld = layers[inp_lid];
1595             int inp_outputs = (int)inp_ld.outputBlobs.size();
1596             std::cout << " " << inp_ld.name << "(" << inp_outputs;
1597
1598             for( int j = 0; j < inp_outputs; j++ )
1599             {
1600                 std::cout << (j == 0 ? ": " : ", ") << inp_ld.outputBlobs[j].size;
1601             }
1602             std::cout << ")";
1603         }
1604         printf("\n");
1605 #endif
1606
1607         //determine parent layers
1608         for (size_t i = 0; i < ninputs; i++)
1609             ld.inputLayersId.insert(ld.inputBlobsId[i].lid);
1610
1611         //allocate parents
1612         for (set<int>::iterator i = ld.inputLayersId.begin(); i != ld.inputLayersId.end(); i++)
1613             allocateLayer(*i, layersShapes);
1614
1615         //bind inputs
1616         if (ld.id == 0)  // DataLayer
1617         {
1618             ninputs = netInputLayer->inputsData.size();
1619             ld.inputBlobsWrappers.resize(ninputs);
1620             for (size_t i = 0; i < ninputs; i++)
1621             {
1622                 ld.inputBlobsWrappers[i] = wrap(netInputLayer->inputsData[i]);
1623             }
1624         }
1625         else
1626         {
1627             ld.inputBlobs.resize(ninputs);
1628             ld.inputBlobsWrappers.resize(ninputs);
1629             for (size_t i = 0; i < ninputs; i++)
1630             {
1631                 LayerPin from = ld.inputBlobsId[i];
1632                 CV_Assert(from.valid());
1633                 CV_DbgAssert(layers.count(from.lid) && (int)layers[from.lid].outputBlobs.size() > from.oid);
1634                 ld.inputBlobs[i] = &layers[from.lid].outputBlobs[from.oid];
1635                 ld.inputBlobsWrappers[i] = layers[from.lid].outputBlobsWrappers[from.oid];
1636             }
1637         }
1638
1639         LayersShapesMap::const_iterator layerShapesIt = layersShapes.find(lid);
1640
1641         CV_Assert(layerShapesIt != layersShapes.end());
1642
1643         std::vector<LayerPin> pinsForInternalBlobs;
1644         blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs,
1645                                           preferableBackend == DNN_BACKEND_OPENCV &&
1646                                           preferableTarget == DNN_TARGET_OPENCL_FP16);
1647         ld.outputBlobsWrappers.resize(ld.outputBlobs.size());
1648         for (int i = 0; i < ld.outputBlobs.size(); ++i)
1649         {
1650             ld.outputBlobsWrappers[i] = wrap(ld.outputBlobs[i]);
1651         }
1652         ld.internalBlobsWrappers.resize(ld.internals.size());
1653         for (int i = 0; i < ld.internals.size(); ++i)
1654         {
1655             ld.internalBlobsWrappers[i] = wrap(ld.internals[i]);
1656         }
1657
1658         Ptr<Layer> layerPtr = ld.getLayerInstance();
1659         {
1660             std::vector<Mat> inps(ld.inputBlobs.size());
1661             for (int i = 0; i < ld.inputBlobs.size(); ++i)
1662             {
1663                 inps[i] = *ld.inputBlobs[i];
1664             }
1665             layerPtr->finalize(inps, ld.outputBlobs);
1666             layerPtr->preferableTarget = preferableTarget;
1667 #if 0
1668             std::cout << "\toutputs:";
1669             size_t noutputs = ld.outputBlobs.size();
1670             for (size_t j = 0; j < noutputs; j++)
1671             {
1672                 std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size;
1673             }
1674             std::cout << "\n";
1675 #endif
1676         }
1677
1678         // After allocating the layer, we decrease the reference counters of its input blobs so the blob manager can reuse that memory.
1679         blobManager.releaseReferences(ld.inputBlobsId);
1680         blobManager.releaseReferences(pinsForInternalBlobs);
1681
1682         ld.flag = 1;
1683     }
1684
1685 #if 0
1686 #define printf_(args) printf args
1687 #else
1688 #define printf_(args)
1689 #endif
1690
1691     void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
1692     {
1693         if( !fusion || (preferableBackend != DNN_BACKEND_OPENCV &&
1694                         preferableBackend != DNN_BACKEND_INFERENCE_ENGINE))
1695             return;
1696
1697         CV_TRACE_FUNCTION();
1698
1699         // Scan through all the layers. If there is a convolution layer followed by an activation layer,
1700         // we try to embed the activation into the convolution and disable separate execution of the activation.
1701         std::set<LayerPin> pinsToKeep(blobsToKeep_.begin(),
1702                                       blobsToKeep_.end());
1703         MapIdToLayerData::iterator it;
1704         for (it = layers.begin(); it != layers.end(); it++)
1705         {
1706             int lid = it->first;
1707             LayerData& ld = layers[lid];
1708             if( ld.skip )
1709             {
1710                 printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1711                 continue;
1712             }
1713             printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
1714
1715             // Optimization #1: try to fuse batch norm, scaling and/or activation layers
1716             // with the current layer if they follow it. Normally, they are fused with the convolution layer,
1717             // but some of them (like activation) may be fused with fully-connected, elementwise (+) and
1718             // some other layers.
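            // A typical chain (illustrative): conv -> batch_norm -> scale -> relu
            // collapses into a single fused convolution, with the follower layers
            // marked as skipped.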
1719             Ptr<Layer>& currLayer = ld.layerInstance;
1720             if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
1721             {
1722                 LayerData* nextData = &layers[ld.consumers[0].lid];
1723                 LayerPin lpNext(ld.consumers[0].lid, 0);
1724                 while (nextData)
1725                 {
1726                     Ptr<Layer> nextLayer = nextData->layerInstance;
1727                     if (currLayer->tryFuse(nextLayer))
1728                     {
1729                         printf_(("\tfused with %s\n", nextLayer->name.c_str()));
1730                         nextData->skip = true;
1731                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1732                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1733                         if (nextData->consumers.size() == 1)
1734                         {
1735                             int nextLayerId = nextData->consumers[0].lid;
1736                             nextData = &layers[nextLayerId];
1737                             lpNext = LayerPin(nextLayerId, 0);
1738                         }
1739                         else
1740                         {
1741                             nextData = 0;
1742                             break;
1743                         }
1744                     }
1745                     else
1746                         break;
1747                 }
1748
1749                 if (preferableBackend != DNN_BACKEND_OPENCV)
1750                     continue;  // Go to the next layer.
1751
1752                 // TODO: the OpenCL target could support more fusion styles.
1753                 if ( preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget) &&
1754                      (!cv::ocl::useOpenCL() || (ld.layerInstance->type != "Convolution" &&
1755                      ld.layerInstance->type != "MVN" && ld.layerInstance->type != "Pooling" &&
1756                      ld.layerInstance->type != "Concat")) )
1757                     continue;
1758
1759                 while (nextData)
1760                 {
1761                     // For now, the OpenCL target supports fusion only with ReLU/ChannelsPReLU/ReLU6/TanH/Power activations.
1762                     if (IS_DNN_OPENCL_TARGET(preferableTarget) &&
1763                         nextData->type != "ReLU" &&
1764                         nextData->type != "ChannelsPReLU" &&
1765                         nextData->type != "ReLU6" &&
1766                         nextData->type != "TanH" &&
1767                         nextData->type != "Power")
1768                         break;
1769
1770                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1771                     if (nextActivLayer.empty())
1772                         break;
1773
1774                     if (currLayer->setActivation(nextActivLayer))
1775                     {
1776                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1777                         nextData->skip = true;
1778                         ld.outputBlobs = layers[lpNext.lid].outputBlobs;
1779                         ld.outputBlobsWrappers = layers[lpNext.lid].outputBlobsWrappers;
1780                         if (nextData->consumers.size() == 1)
1781                         {
1782                             int nextLayerId = nextData->consumers[0].lid;
1783                             nextData = &layers[nextLayerId];
1784                             lpNext = LayerPin(nextLayerId, 0);
1785                         }
1786                         else
1787                         {
1788                             nextData = 0;
1789                             break;
1790                         }
1791                     }
1792                     else
1793                         break;
1794                 }
1795
1796                 // fuse convolution layer followed by eltwise + relu
1797                 if ( IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution" )
1798                 {
1799                     Ptr<EltwiseLayer> nextEltwiseLayer;
1800                     if( nextData )
1801                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
1802
1803                     if( !nextEltwiseLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
1804                         nextData && nextData->inputBlobsId.size() == 2 )
1805                     {
1806                         LayerData *eltwiseData = nextData;
1807
1808                         // The Eltwise layer has two inputs. We need to determine which
1809                         // is the base convolution layer and which could be used as its bias.
1810                         LayerData* biasLayerData = 0;
1811                         for (int i = 0; i < 2; ++i)
1812                         {
1813                             LayerData *downLayerData = &layers[eltwiseData->inputBlobsId[i].lid];
1814                             CV_Assert(downLayerData);
1815                             while (downLayerData->skip)
1816                             {
1817                                 if (downLayerData->inputBlobsId.size() == 1)
1818                                     downLayerData = &layers[downLayerData->inputBlobsId[0].lid];
1819                                 else
1820                                 {
1821                                     downLayerData = 0;
1822                                     break;
1823                                 }
1824                             }
1825                             if (downLayerData && ld.id == downLayerData->id)
1826                             {
1827                                 biasLayerData = &layers[eltwiseData->inputBlobsId[1 - i].lid];
1828                                 break;
1829                             }
1830                         }
1831                         CV_Assert(biasLayerData);
1832                         {
1833                             if( eltwiseData->consumers.size() == 1 )
1834                             {
1835                                 // fuse eltwise + activation layer
1836                                 if (biasLayerData->id < ld.id)
1837                                 {
1838                                     nextData = &layers[eltwiseData->consumers[0].lid];
1839                                     lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
1840                                     Ptr<ActivationLayer> nextActivLayer;
1841                                     if( nextData )
1842                                         nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
1843
1844                                     if( !nextActivLayer.empty() && pinsToKeep.count(lpNext) == 0 &&
1845                                             (nextData->type == "ReLU" ||
1846                                              nextData->type == "ChannelsPReLU" ||
1847                                              nextData->type == "Power") &&
1848                                             currLayer->setActivation(nextActivLayer) )
1849                                     {
1850                                         CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
1851                                         ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
1852                                         printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
1853                                         printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
1854                                         eltwiseData->skip = true;
1855                                         nextData->skip = true;
1856                                         // This optimization is for cases like:
1857                                         // some_layer   conv
1858                                         //   |             |
1859                                         //   +-- eltwise --+
1860                                         //          |
1861                                         //        activ
1862                                         // This way all the element-wise computations
1863                                         // (i.e. some_layer+conv or some_layer*conv)
1864                                         // would be done at the [conv] layer. So we need to
1865                                         // replace [conv]'s output blob with [eltwise]'s one,
1866                                         // considering that [activ] is an in-place layer.
1867                                         // Also we need to move all the consumers' references.
1868                                         // To prevent memory collisions (i.e. when input of
1869                                         // [conv] and output of [eltwise] is the same blob)
1870                                         // we allocate a new blob.
1871                                         CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
1872                                         ld.outputBlobs[0] = ld.outputBlobs[0].clone();
1873                                         ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
1874
1875                                         eltwiseData->outputBlobs = ld.outputBlobs;
1876                                         nextData->outputBlobs = ld.outputBlobs;
1877                                         eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
1878                                         nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
1879
1880                                         // Move references of [activ] layer consumers to the newly allocated blob.
1881                                         for (int i = 0; i < nextData->consumers.size(); ++i)
1882                                         {
1883                                             LayerData& consumer = layers[nextData->consumers[i].lid];
1884                                             for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
1885                                             {
1886                                                 if (consumer.inputBlobsId[j].lid == lpNext.lid)
1887                                                 {
1888                                                     consumer.inputBlobs[j] = &ld.outputBlobs[0];
1889                                                     consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
1890                                                     break;
1891                                                 }
1892                                             }
1893                                         }
1894                                     }
1895                                 }
1896                             }
1897                         }
1898                     }
1899                 }
1900             }
1901
1902             if (preferableBackend != DNN_BACKEND_OPENCV)
1903                 continue;  // Go to the next layer.
1904
1905             // Optimization #2: if there is no layer that takes the max pooling layer's computed
1906             // max indices (only some semantic segmentation networks might need them;
1907             // many others take only the maximum values), then we switch the max pooling
1908             // layer to the faster operating mode.
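            // E.g. (illustrative): a MaxPooling layer exposes the pooled values on
            // output pin 0 and the max indices on pin 1; if no consumer reads
            // pin 1, computeMaxIdx is turned off below.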
1909             Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
1910             if( !poolingLayer.empty() && !ld.consumers.empty() )
1911             {
1912                 size_t i = 0, nconsumers = ld.consumers.size();
1913                 for( ; i < nconsumers; i++ )
1914                     if( ld.consumers[i].oid > 0 )
1915                         break;
1916                 // If no layer takes the second output pin of the pooling layer
1917                 // as input, then we don't need to compute the indices.
1918                 if( i >= nconsumers )
1919                 {
1920                     poolingLayer->computeMaxIdx = false;
1921                     printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
1922                 }
1923             }
1924
1925             // Optimization #3: if there is a concat layer that concatenates channels
1926             // from the inputs (i.e. axis == 1), then we make the inputs of
1927             // the concat layer write directly into the concatenation output buffer
1928             // (and so we eliminate the concat layer, because the channels
1929             // are concatenated implicitly).
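            // Memory layout sketch (illustrative channel counts): two inputs of
            // 64 and 128 channels write straight into one 192-channel buffer:
            //     input0 -> output(:, 0..63)
            //     input1 -> output(:, 64..191)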
1930             Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
1931             if( !concatLayer.empty() && concatLayer->axis == 1 && !concatLayer->padding &&
1932                 ld.outputBlobs.size() == 1 )
1933             {
1934                 Mat& output = ld.outputBlobs[0];
1935                 UMat umat_output;
1936                 if (!ld.outputBlobsWrappers.empty() &&
1937                     (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget)))
1938                 {
1939                     size_t i, ninputs = ld.inputBlobsId.size();
1940                     bool conv_layer = true;
1941                     for( i = 0; i < ninputs; i++ )
1942                     {
1943                         LayerPin pin = ld.inputBlobsId[i];
1944                         LayerData* inp_i_data = &layers[pin.lid];
1945                         while(inp_i_data->skip &&
1946                               inp_i_data->inputBlobsId.size() == 1 &&
1947                               inp_i_data->consumers.size() == 1)
1948                         {
1949                             pin = inp_i_data->inputBlobsId[0];
1950                             inp_i_data = &layers[pin.lid];
1951                         }
1952                         conv_layer = conv_layer && (inp_i_data->getLayerInstance()->type == "Convolution");
1953                     }
1954                     if (!conv_layer)
1955                         continue;
1956                     std::vector<UMat> umat_outputBlobs;
1957                     umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
1958                     umat_output = umat_outputBlobs[0];
1959                 }
1960
1961                 // TODO: in general, this optimization can always be done, but
1962                 // many layers currently check that the input/output blobs are
1963                 // continuous arrays. Unfortunately, this is not true when
1964                 // the concatenation optimization is applied with batch_size > 1,
1965                 // so for now we only apply this optimization in the most popular
1966                 // case, batch_size == 1.
1967                 if( output.dims == 4 && output.size[0] == 1 )
1968                 {
1969                     size_t i, ninputs = ld.inputBlobsId.size();
1970                     std::vector<LayerPin> realinputs(ninputs);
1971                     for( i = 0; i < ninputs; i++ )
1972                     {
1973                         LayerPin pin = ld.inputBlobsId[i];
1974                         LayerData* inp_i_data = &layers[pin.lid];
1975                         while(inp_i_data->skip &&
1976                               inp_i_data->inputBlobsId.size() == 1 &&
1977                               inp_i_data->consumers.size() == 1)
1978                         {
1979                             pin = inp_i_data->inputBlobsId[0];
1980                             inp_i_data = &layers[pin.lid];
1981                         }
1982                         printf_(("\treal input for %s is %s\n",
1983                                layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
1984                                inp_i_data->getLayerInstance()->name.c_str()));
1985
1986                         if(inp_i_data->skip || inp_i_data->consumers.size() != 1)
1987                             break;
1988                         realinputs[i] = pin;
1989                     }
1990
1991                     if( i >= ninputs )
1992                     {
1993                         // Allocate new memory to prevent collisions during memory
1994                         // reusing (see https://github.com/opencv/opencv/pull/10456).
1995                         output = output.clone();
1996                         if (preferableBackend == DNN_BACKEND_OPENCV &&
1997                             IS_DNN_OPENCL_TARGET(preferableTarget))
1998                         {
1999                             std::vector<UMat> umats(1);
2000                             umat_output = umat_output.clone();
2001                             umats[0] = umat_output;
2002                             OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umats);
2003                         }
2004                         Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
2005                         int ofs = 0;
2006                         for( i = 0; i < ninputs; i++ )
2007                         {
2008                             LayerPin pin = realinputs[i];
2009                             LayerData* inp_i_data = &layers[pin.lid];
2010                             int channels_i = ld.inputBlobs[i]->size[1];
2011                             chrange[1] = Range(ofs, ofs + channels_i);
2012                             printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
2013                                    pin.oid, ofs, ofs + channels_i));
2014                             ofs += channels_i;
2015                             Mat output_slice = output(chrange);
2016                             Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
2017                             CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
2018                             Mat* oldPtr = &curr_output;
2019                             curr_output = output_slice;
2020                             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2021                             {
2022                                 std::vector<UMat> umats(inp_i_data->outputBlobsWrappers.size());
2023                                 umats[pin.oid] = umat_output(chrange);
2024                                 OpenCLBackendWrapper::update(inp_i_data->outputBlobsWrappers, umats);
2025                             }
2026                             // Layers that referred to the old input Mat will now refer to the
2027                             // new data through the same Mat object.
2028                             CV_Assert_N(curr_output.data == output_slice.data, oldPtr == &curr_output);
2029                         }
2030                         ld.skip = true;
2031                         printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
2032                     }
2033                 }
2034             }
2035         }
2036     }
2037
2038     void allocateLayers(const std::vector<LayerPin>& blobsToKeep_)
2039     {
2040         CV_TRACE_FUNCTION();
2041
2042         MapIdToLayerData::iterator it;
2043         for (it = layers.begin(); it != layers.end(); it++)
2044             it->second.flag = 0;
2045
2046         CV_Assert(!layers[0].outputBlobs.empty());
2047         ShapesVec inputShapes;
2048         for(int i = 0; i < layers[0].outputBlobs.size(); i++)
2049         {
2050             Mat& inp = layers[0].outputBlobs[i];
2051             CV_Assert(inp.total());
2052             if (preferableBackend == DNN_BACKEND_OPENCV &&
2053                 preferableTarget == DNN_TARGET_OPENCL_FP16)
2054             {
2055                 layers[0].outputBlobs[i].create(inp.dims, inp.size, CV_16S);  // FP16 data is stored in CV_16S blobs
2056             }
2057             inputShapes.push_back(shape(inp));
2058         }
2059         LayersShapesMap layersShapes;
2060         getLayersShapes(inputShapes, layersShapes);
2061
2062         blobManager.reset();
2063         backendWrappers.clear();
2064         // Add fake references to the input blobs so that their memory is not reused by the blob manager.
2065         for (int i = 0; i < layers[0].outputBlobs.size(); ++i)
2066             blobManager.addReference(LayerPin(0, i));
2067         for (it = layers.begin(); it != layers.end(); ++it)
2068         {
2069             const LayerData& ld = it->second;
2070             blobManager.addReferences(ld.inputBlobsId);
2071         }
2072
2073         for (int i = 0; i < blobsToKeep_.size(); i++)
2074         {
2075             blobManager.addReference(blobsToKeep_[i]);
2076         }
2077
2078         for (it = layers.begin(); it != layers.end(); it++)
2079         {
2080             int lid = it->first;
2081             allocateLayer(lid, layersShapes);
2082         }
2083
2084         layersTimings.resize(lastLayerId + 1, 0);
2085         fuseLayers(blobsToKeep_);
2086     }
2087
2088     void forwardLayer(LayerData &ld)
2089     {
2090         CV_TRACE_FUNCTION();
2091
2092         Ptr<Layer> layer = ld.layerInstance;
2093
2094         TickMeter tm;
2095         tm.start();
2096
2097         if( !ld.skip )
2098         {
2099             std::map<int, Ptr<BackendNode> >::iterator it = ld.backendNodes.find(preferableBackend);
2100             if (preferableBackend == DNN_BACKEND_OPENCV || it == ld.backendNodes.end() || it->second.empty())
2101             {
2102                 if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
2103                 {
2104                     std::vector<UMat> umat_inputBlobs = OpenCLBackendWrapper::getUMatVector(ld.inputBlobsWrappers);
2105                     std::vector<UMat> umat_outputBlobs = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2106                     std::vector<UMat> umat_internalBlobs = OpenCLBackendWrapper::getUMatVector(ld.internalBlobsWrappers);
2107                     layer->forward(umat_inputBlobs,
2108                                    umat_outputBlobs,
2109                                    umat_internalBlobs);
2110                     if (DNN_CHECK_NAN_INF)
2111                     {
2112                         bool fail = false;
2113                         for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2114                         {
2115                             UMat& u = umat_outputBlobs[i];
2116                             Mat m;
2117                             if (u.depth() == CV_16S) // FP16
2118                                 convertFp16(u, m);
2119                             else
2120                                 m = u.getMat(ACCESS_READ);
2121                             if (!checkRange(m))
2122                             {
2123                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2124                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2125                                 fail = true;
2126                             }
2127                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2128                             {
2129                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2130                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2131                                 fail = true;
2132                             }
2133                         }
2134                         if (fail)
2135                         {
2136                             for (size_t i = 0; i < umat_inputBlobs.size(); ++i)
2137                             {
2138                                 UMat& u = umat_inputBlobs[i];
2139                                 Mat m;
2140                                 if (u.depth() == CV_16S) // FP16
2141                                     convertFp16(u, m);
2142                                 else
2143                                     m = u.getMat(ACCESS_READ);
2144                                 std::cout << "INPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2145                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2146                             }
2147                             for (size_t i = 0; i < umat_outputBlobs.size(); ++i)
2148                             {
2149                                 UMat& u = umat_outputBlobs[i];
2150                                 Mat m;
2151                                 if (u.depth() == CV_16S) // FP16
2152                                     convertFp16(u, m);
2153                                 else
2154                                     m = u.getMat(ACCESS_READ);
2155                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(u.type()) << " " << shape(m) << std::endl;
2156                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2157                             }
2158                             for (size_t i = 0; i < umat_internalBlobs.size(); ++i)
2159                             {
2160                                 UMat& u = umat_internalBlobs[i];
2161                                 Mat m;
2162                                 if (u.depth() == CV_16S) // FP16
2163                                     convertFp16(u, m);
2164                                 else
2165                                     m = u.getMat(ACCESS_READ);
2166                                 std::cout << "INTERNAL " << i << " " << shape(m) << std::endl;
2167                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << cv::typeToString(u.type()) << " " << m.reshape(1, 1) << std::endl;
2168                             }
2169                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2170                                 CV_Assert(!fail);
2171                         }
2172                     }
2173                     OpenCLBackendWrapper::update(ld.outputBlobsWrappers, umat_outputBlobs);
2174                 }
2175                 else
2176                 {
2177                     for (int i = 0, n = ld.inputBlobsWrappers.size(); i < n; ++i)
2178                     {
2179                         if (!ld.inputBlobsWrappers[i].empty())
2180                             ld.inputBlobsWrappers[i]->copyToHost();
2181                     }
2182
2183                     std::vector<Mat> inps(ld.inputBlobs.size());
2184                     for (int i = 0; i < ld.inputBlobs.size(); ++i)
2185                     {
2186                         inps[i] = *ld.inputBlobs[i];
2187                     }
2188                     layer->forward(inps, ld.outputBlobs, ld.internals);
2189
2190                     if (DNN_CHECK_NAN_INF)
2191                     {
2192                         bool fail = false;
2193                         for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2194                         {
2195                             const Mat& m = ld.outputBlobs[i];
2196                             if (!checkRange(m))
2197                             {
2198                                 std::cerr << "WARNING: NaN detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2199                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2200                                 fail = true;
2201                             }
2202                             else if (!checkRange(m, true, NULL, -1e6, 1e6))
2203                             {
2204                                 std::cerr << "WARNING: Inf detected in layer output: id=" << ld.id << " name=" << layer->name << std::endl;
2205                                 std::cerr << "output id=" << i << " output shape=" << shape(m) << std::endl;
2206                                 fail = true;
2207                             }
2208                         }
2209                         if (fail)
2210                         {
2211                             for (size_t i = 0; i < ld.inputBlobs.size(); ++i)
2212                             {
2213                                 const Mat* pM = ld.inputBlobs[i];
2214                                 if (!pM)
2215                                 {
2216                                     std::cout << "INPUT " << i << " is NULL" << std::endl;
2217                                     continue;
2218                                 }
2219                                 const Mat& m = *pM;
2220                                 std::cout << "INPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2221                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2222                             }
2223                             for (size_t i = 0; i < ld.outputBlobs.size(); ++i)
2224                             {
2225                                 const Mat& m = ld.outputBlobs[i];
2226                                 std::cout << "OUTPUT " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2227                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2228                             }
2229                             for (size_t i = 0; i < ld.internals.size(); ++i)
2230                             {
2231                                 const Mat& m = ld.internals[i];
2232                                 std::cout << "INTERNAL " << i << " " << cv::typeToString(m.type()) << " " << shape(m) << std::endl;
2233                                 if (DNN_CHECK_NAN_INF_DUMP) std::cout << m.reshape(1, 1) << std::endl;
2234                             }
2235                             if (DNN_CHECK_NAN_INF_RAISE_ERROR)
2236                                 CV_Assert(!fail);
2237                         }
2238                     }
2239
2240                     for (int i = 0, n = ld.outputBlobsWrappers.size(); i < n; ++i)
2241                     {
2242                         if (!ld.outputBlobsWrappers[i].empty())
2243                             ld.outputBlobsWrappers[i]->setHostDirty();
2244                     }
2245                 }
2246             }
2247             else
2248             {
2249                 Ptr<BackendNode> node = it->second;
2250                 CV_Assert(!node.empty());
2251                 if (preferableBackend == DNN_BACKEND_HALIDE)
2252                 {
2253                     forwardHalide(ld.outputBlobsWrappers, node);
2254                 }
2255                 else if (preferableBackend == DNN_BACKEND_INFERENCE_ENGINE)
2256                 {
2257                     forwardInfEngine(node);
2258                 }
2259                 else
2260                 {
2261                     CV_Error(Error::StsNotImplemented, "Unknown backend identifier");
2262                 }
2263             }
2264         }
2265         else
2266             tm.reset();
2267
2268         tm.stop();
2269         layersTimings[ld.id] = tm.getTimeTicks();
2270
2271         ld.flag = 1;
2272     }
2273
2274     void forwardToLayer(LayerData &ld, bool clearFlags = true)
2275     {
2276         CV_TRACE_FUNCTION();
2277
2278         if (clearFlags)
2279         {
2280             MapIdToLayerData::iterator it;
2281             for (it = layers.begin(); it != layers.end(); it++)
2282                 it->second.flag = 0;
2283         }
2284
2285         // already forwarded
2286         if (ld.flag)
2287             return;
2288
2289         //forward parents
2290         MapIdToLayerData::iterator it;
2291         for (it = layers.begin(); it != layers.end() && (it->second.id < ld.id); ++it)
2292         {
2293             LayerData &prevLd = it->second;
2294             if (prevLd.flag)
2295                 continue;
2296             forwardLayer(prevLd);
2297         }
2298
2299         //forward itself
2300         forwardLayer(ld);
2301     }
2302
2303     void forwardAll()
2304     {
2305         CV_TRACE_FUNCTION();
2306
2307         MapIdToLayerData::reverse_iterator last_layer = layers.rbegin();
2308         CV_Assert(last_layer != layers.rend());
2309         forwardToLayer(last_layer->second, true);
2310     }
2311
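    // Recursively infer the in/out/internal shapes for the layer with the given id:
    // the producers' shapes are resolved first, and inOutShapes doubles as a
    // memoization cache so each layer's shapes are computed only once.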
2312     void getLayerShapesRecursively(int id, LayersShapesMap& inOutShapes)
2313     {
2314         std::vector<LayerPin>& inputLayerIds = layers[id].inputBlobsId;
2315
2316         if (inOutShapes[id].in.empty())
2317         {
2318             for(int i = 0; i < inputLayerIds.size(); i++)
2319             {
2320                 int layerId = inputLayerIds[i].lid;
2321                 LayersShapesMap::iterator it =
2322                         inOutShapes.find(layerId);
2323                 if(it == inOutShapes.end() ||
2324                         it->second.out.empty())
2325                 {
2326                     getLayerShapesRecursively(layerId, inOutShapes);
2327                 }
2328                 const MatShape& shape = inOutShapes[layerId].out[inputLayerIds[i].oid];
2329                 inOutShapes[id].in.push_back(shape);
2330             }
2331         }
2332         const ShapesVec& is = inOutShapes[id].in;
2333         ShapesVec& os = inOutShapes[id].out;
2334         ShapesVec& ints = inOutShapes[id].internal;
2335         int requiredOutputs = layers[id].requiredOutputs.size();
2336         inOutShapes[id].supportInPlace =
2337                 layers[id].getLayerInstance()->getMemoryShapes(is, requiredOutputs, os, ints);
2338     }
2339
2340     void getLayersShapes(const ShapesVec& netInputShapes,
2341                          LayersShapesMap& inOutShapes)
2342     {
2343         inOutShapes.clear();
2344
2345         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2346         for (MapIdToLayerData::iterator it = layers.begin();
2347              it != layers.end(); it++)
2348         {
2349             getLayerShapesRecursively(it->first, inOutShapes);
2350         }
2351     }
2352
2353     void getLayerShapes(const ShapesVec& netInputShapes,
2354                         const int layerId,
2355                         LayerShapes& shapes)
2356     {
2357         LayersShapesMap inOutShapes;
2358         inOutShapes[0].in = netInputShapes; //insert shape for first input layer
2359         getLayerShapesRecursively(layerId, inOutShapes);
2360         shapes = inOutShapes[layerId];
2361     }
2362
2363     LayerPin getLatestLayerPin(const std::vector<LayerPin>& pins)
2364     {
2365         return *std::max_element(pins.begin(), pins.end());
2366     }
2367
2368     Mat getBlob(const LayerPin& pin)
2369     {
2370         CV_TRACE_FUNCTION();
2371
2372         if (!pin.valid())
2373             CV_Error(Error::StsObjectNotFound, "Requested blob not found");
2374
2375         LayerData &ld = layers[pin.lid];
2376         if ((size_t)pin.oid >= ld.outputBlobs.size())
2377         {
2378             CV_Error(Error::StsOutOfRange, format("Layer \"%s\" produces only %d outputs, "
2379                                            "but output #%d was requested", ld.name.c_str(),
2380                                            (int)ld.outputBlobs.size(), pin.oid));
2381         }
2382         if (preferableTarget != DNN_TARGET_CPU)
2383         {
2384             CV_Assert(!ld.outputBlobsWrappers.empty() && !ld.outputBlobsWrappers[pin.oid].empty());
2385             // Transfer data to CPU if required.
2386             ld.outputBlobsWrappers[pin.oid]->copyToHost();
2387         }
2388
2389         if (ld.outputBlobs[pin.oid].depth() == CV_16S)
2390         {
2391             convertFp16(ld.outputBlobs[pin.oid], output_blob);
2392             return output_blob;
2393         }
2394         else
2395             return ld.outputBlobs[pin.oid];
2396     }
2397
2398     Mat getBlob(String outputName)
2399     {
2400         return getBlob(getPinByAlias(outputName));
2401     }
2402 };
2403
2404 Net::Net() : impl(new Net::Impl)
2405 {
2406 }
2407
2408 Net Net::readFromModelOptimizer(const String& xml, const String& bin)
2409 {
2410 #ifndef HAVE_INF_ENGINE
2411     CV_Error(Error::StsError, "Build OpenCV with Inference Engine to enable loading models from Model Optimizer.");
2412 #else
2413     InferenceEngine::CNNNetReader reader;
2414     reader.ReadNetwork(xml);
2415     reader.ReadWeights(bin);
2416
2417     InferenceEngine::CNNNetwork ieNet = reader.getNetwork();
2418
2419     std::vector<String> inputsNames;
2420     for (auto& it : ieNet.getInputsInfo())
2421     {
2422         inputsNames.push_back(it.first);
2423     }
2424
2425     Net cvNet;
2426     cvNet.setInputsNames(inputsNames);
2427
2428     Ptr<InfEngineBackendNode> backendNode(new InfEngineBackendNode(0));
2429     backendNode->net = Ptr<InfEngineBackendNet>(new InfEngineBackendNet(ieNet));
2430     for (auto& it : ieNet.getOutputsInfo())
2431     {
2432         Ptr<Layer> cvLayer(new InfEngineBackendLayer(it.second));
2433         InferenceEngine::CNNLayerPtr ieLayer = ieNet.getLayerByName(it.first.c_str());
2434         CV_Assert(ieLayer);
2435
2436         LayerParams lp;
2437         int lid = cvNet.addLayer(it.first, "", lp);
2438
2439         LayerData& ld = cvNet.impl->layers[lid];
2440         cvLayer->name = it.first;
2441         cvLayer->type = ieLayer->type;
2442         ld.layerInstance = cvLayer;
2443         ld.backendNodes[DNN_BACKEND_INFERENCE_ENGINE] = backendNode;
2444
2445         for (int i = 0; i < inputsNames.size(); ++i)
2446             cvNet.connect(0, i, lid, i);
2447     }
2448     cvNet.setPreferableBackend(DNN_BACKEND_INFERENCE_ENGINE);
2449
2450     cvNet.impl->skipInfEngineInit = true;
2451     return cvNet;
2452 #endif  // HAVE_INF_ENGINE
2453 }
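
#if 0
// Minimal usage sketch for the Model Optimizer loader (illustrative only;
// "model.xml" and "model.bin" are placeholder paths to an OpenVINO IR pair):
static void exampleReadFromModelOptimizer()
{
    Net net = Net::readFromModelOptimizer("model.xml", "model.bin");
    // The returned network is already bound to the Inference Engine backend,
    // so the forward pass runs inside the Inference Engine.
}
#endif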
2454
2455 Net::~Net()
2456 {
2457 }
2458
2459 int Net::addLayer(const String &name, const String &type, LayerParams &params)
2460 {
2461     CV_TRACE_FUNCTION();
2462
2463     if (impl->getLayerId(name) >= 0)
2464     {
2465         CV_Error(Error::StsBadArg, "Layer \"" + name + "\" is already in the net");
2466         return -1;
2467     }
2468
2469     int id = ++impl->lastLayerId;
2470     impl->layerNameToId.insert(std::make_pair(name, id));
2471     impl->layers.insert(std::make_pair(id, LayerData(id, name, type, params)));
2472
2473     return id;
2474 }
2475
2476 int Net::addLayerToPrev(const String &name, const String &type, LayerParams &params)
2477 {
2478     CV_TRACE_FUNCTION();
2479
2480     int prvLid = impl->lastLayerId;
2481     int newLid = this->addLayer(name, type, params);
2482     this->connect(prvLid, 0, newLid, 0);
2483     return newLid;
2484 }
2485
2486 void Net::connect(int outLayerId, int outNum, int inpLayerId, int inpNum)
2487 {
2488     CV_TRACE_FUNCTION();
2489
2490     impl->connect(outLayerId, outNum, inpLayerId, inpNum);
2491 }
2492
2493 void Net::connect(String _outPin, String _inPin)
2494 {
2495     CV_TRACE_FUNCTION();
2496
2497     LayerPin outPin = impl->getPinByAlias(_outPin);
2498     LayerPin inpPin = impl->getPinByAlias(_inPin);
2499
2500     CV_Assert(outPin.valid() && inpPin.valid());
2501
2502     impl->connect(outPin.lid, outPin.oid, inpPin.lid, inpPin.oid);
2503 }
2504
2505 Mat Net::forward(const String& outputName)
2506 {
2507     CV_TRACE_FUNCTION();
2508
2509     String layerName = outputName;
2510
2511     if (layerName.empty())
2512         layerName = getLayerNames().back();
2513
2514     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2515     impl->setUpNet(pins);
2516     impl->forwardToLayer(impl->getLayerData(layerName));
2517
2518     return impl->getBlob(layerName);
2519 }
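
// Usage sketch (illustrative; the output name "prob" is a placeholder):
//     Mat prob = net.forward("prob");  // run the net up to the layer producing "prob"
//     Mat last = net.forward();        // or up to the last registered layer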
2520
2521 void Net::forward(OutputArrayOfArrays outputBlobs, const String& outputName)
2522 {
2523     CV_TRACE_FUNCTION();
2524
2525     String layerName = outputName;
2526
2527     if (layerName.empty())
2528         layerName = getLayerNames().back();
2529
2530     std::vector<LayerPin> pins(1, impl->getPinByAlias(layerName));
2531     impl->setUpNet(pins);
2532     impl->forwardToLayer(impl->getLayerData(layerName));
2533
2534     LayerPin pin = impl->getPinByAlias(layerName);
2535     LayerData &ld = impl->layers[pin.lid];
2536
2537     if (outputBlobs.isUMat())
2538     {
2539         impl->getBlob(layerName).copyTo(outputBlobs);
2540     }
2541     else if (outputBlobs.isMat())
2542     {
2543         outputBlobs.assign(impl->getBlob(layerName));
2544     }
2545     else if (outputBlobs.isMatVector())
2546     {
2547         if (impl->preferableTarget != DNN_TARGET_CPU)
2548         {
2549             for (int i = 0; i < ld.outputBlobsWrappers.size(); ++i)
2550             {
2551                 CV_Assert(!ld.outputBlobsWrappers[i].empty());
2552                 ld.outputBlobsWrappers[i]->copyToHost();
2553             }
2554         }
2555         if (ld.outputBlobs[0].depth() == CV_32F)
2556         {
2557             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2558             outputvec = ld.outputBlobs;
2559         } else {
2560             std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2561             outputvec.resize(ld.outputBlobs.size());
2562             for (int i = 0; i < outputvec.size(); i++)
2563                 convertFp16(ld.outputBlobs[i], outputvec[i]);
2564         }
2565     }
2566     else if (outputBlobs.isUMatVector())
2567     {
2568         std::vector<UMat> & outputvec = *(std::vector<UMat> *)outputBlobs.getObj();
2569
2570         if (impl->preferableBackend == DNN_BACKEND_OPENCV &&
2571             IS_DNN_OPENCL_TARGET(impl->preferableTarget))
2572         {
2573             if (impl->preferableTarget == DNN_TARGET_OPENCL)
2574                 outputvec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2575             else if (impl->preferableTarget == DNN_TARGET_OPENCL_FP16)
2576             {
2577                 std::vector<UMat> out_vec = OpenCLBackendWrapper::getUMatVector(ld.outputBlobsWrappers);
2578                 outputvec.resize(out_vec.size());
2579                 for (int i = 0; i < out_vec.size(); i++)
2580                     convertFp16(out_vec[i], outputvec[i]);
2581             }
2582         }
2583         else
2584         {
2585             outputvec.resize(ld.outputBlobs.size());
2586             for (int i = 0; i < outputvec.size(); ++i)
2587                 ld.outputBlobs[i].copyTo(outputvec[i]);
2588         }
2589     }
2590 }
2591
2592 void Net::forward(OutputArrayOfArrays outputBlobs,
2593                   const std::vector<String>& outBlobNames)
2594 {
2595     CV_TRACE_FUNCTION();
2596
2597     std::vector<LayerPin> pins;
2598     for (int i = 0; i < outBlobNames.size(); i++)
2599     {
2600         pins.push_back(impl->getPinByAlias(outBlobNames[i]));
2601     }
2602
2603     impl->setUpNet(pins);
2604
2605     LayerPin out = impl->getLatestLayerPin(pins);
2606
2607     impl->forwardToLayer(impl->getLayerData(out.lid));
2608
2609     std::vector<Mat> matvec;
2610     for (int i = 0; i < pins.size(); i++)
2611     {
2612         matvec.push_back(impl->getBlob(pins[i]));
2613     }
2614
2615     std::vector<Mat> & outputvec = *(std::vector<Mat> *)outputBlobs.getObj();
2616     outputvec = matvec;
2617 }
2618
2619 void Net::forward(std::vector<std::vector<Mat> >& outputBlobs,
2620                      const std::vector<String>& outBlobNames)
2621 {
2622     CV_TRACE_FUNCTION();
2623
2624     std::vector<LayerPin> pins;
2625     for (int i = 0; i < outBlobNames.size(); i++)
2626     {
2627         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2628         pins.insert(pins.end(), lp.begin(), lp.end());
2629     }
2630
2631     impl->setUpNet(pins);
2632
2633     LayerPin out = impl->getLatestLayerPin(pins);
2634
2635     impl->forwardToLayer(impl->getLayerData(out.lid));
2636
2637     outputBlobs.resize(outBlobNames.size());
2638     for (int i = 0; i < outBlobNames.size(); i++)
2639     {
2640         std::vector<LayerPin> lp = impl->getLayerOutPins(outBlobNames[i]);
2641         for (int j = 0; j < lp.size(); j++)
2642         {
2643             outputBlobs[i].push_back(impl->getBlob(lp[j]));
2644         }
2645     }
2646 }
2647
void Net::setPreferableBackend(int backendId)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(backendId);

    if (impl->preferableBackend != backendId)
    {
        impl->preferableBackend = backendId;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setPreferableTarget(int targetId)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG(targetId);

    if (impl->preferableTarget != targetId)
    {
        impl->preferableTarget = targetId;
        if (IS_DNN_OPENCL_TARGET(targetId))
        {
#ifndef HAVE_OPENCL
#ifdef HAVE_INF_ENGINE
            if (impl->preferableBackend == DNN_BACKEND_OPENCV)
#else
            if (impl->preferableBackend == DNN_BACKEND_DEFAULT ||
                impl->preferableBackend == DNN_BACKEND_OPENCV)
#endif  // HAVE_INF_ENGINE
                impl->preferableTarget = DNN_TARGET_CPU;
#else
            bool fp16 = ocl::Device::getDefault().isExtensionSupported("cl_khr_fp16");
            if (!fp16 && targetId == DNN_TARGET_OPENCL_FP16)
                impl->preferableTarget = DNN_TARGET_OPENCL;
#endif
        }
        impl->netWasAllocated = false;
        impl->clear();
    }
}

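// Typical configuration sketch: select the backend, then the target. Either
// setter invalidates the allocated net, so the next forward() reallocates.
// Note the fallbacks implemented above: with the OpenCV backend an OpenCL
// target degrades to CPU in builds without OpenCL, and OPENCL_FP16 degrades
// to OPENCL when the device lacks cl_khr_fp16.
//
//     net.setPreferableBackend(DNN_BACKEND_OPENCV);
//     net.setPreferableTarget(DNN_TARGET_OPENCL_FP16);
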
void Net::setInputsNames(const std::vector<String> &inputBlobNames)
{
    CV_TRACE_FUNCTION();

    impl->netInputLayer->setNames(inputBlobNames);
}

void Net::setInput(InputArray blob, const String& name, double scalefactor, const Scalar& mean)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    LayerPin pin;
    pin.lid = 0;
    pin.oid = impl->resolvePinOutputName(impl->getLayerData(pin.lid), name);

    if (!pin.valid())
        CV_Error(Error::StsObjectNotFound, "Requested blob \"" + name + "\" not found");

    LayerData &ld = impl->layers[pin.lid];
    const int numInputs = std::max(pin.oid+1, (int)ld.requiredOutputs.size());
    ld.outputBlobs.resize(numInputs);
    ld.outputBlobsWrappers.resize(numInputs);
    impl->netInputLayer->inputsData.resize(numInputs);
    impl->netInputLayer->scaleFactors.resize(numInputs);
    impl->netInputLayer->means.resize(numInputs);

    MatShape prevShape = shape(impl->netInputLayer->inputsData[pin.oid]);
    Mat blob_ = blob.getMat();
    bool oldShape = prevShape == shape(blob_);
    if (oldShape)
    {
        blob_.copyTo(impl->netInputLayer->inputsData[pin.oid]);
    }
    else
    {
        ld.outputBlobs[pin.oid] = blob_.clone();
        impl->netInputLayer->inputsData[pin.oid] = ld.outputBlobs[pin.oid];
    }

    if (!ld.outputBlobsWrappers[pin.oid].empty())
    {
        ld.outputBlobsWrappers[pin.oid]->setHostDirty();
    }
    impl->netInputLayer->scaleFactors[pin.oid] = scalefactor;
    impl->netInputLayer->means[pin.oid] = mean;
    impl->netWasAllocated = impl->netWasAllocated && oldShape;
}

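// Feeding the network, as a sketch (the blob geometry must match what the
// model expects; "data" is a placeholder input name and `img` a BGR image):
//
//     Mat inputBlob = blobFromImage(img, 1.0, Size(224, 224), Scalar(), true);
//     net.setInput(inputBlob, "data");
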
Mat Net::getParam(LayerId layer, int numParam)
{
    LayerData &ld = impl->getLayerData(layer);
    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
    return layerBlobs[numParam];
}

void Net::setParam(LayerId layer, int numParam, const Mat &blob)
{
    LayerData &ld = impl->getLayerData(layer);

    std::vector<Mat> &layerBlobs = ld.getLayerInstance()->blobs;
    CV_Assert(0 <= numParam && numParam < (int)layerBlobs.size());
    // We don't perform strong checks here; use this function carefully.
    layerBlobs[numParam] = blob;
}

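// Weight-surgery sketch using the accessors above (the layer id and blob
// index here are hypothetical and model-specific; shape compatibility is
// the caller's responsibility, as noted in setParam):
//
//     Mat w = net.getParam(1, 0).clone();
//     w *= 0.5;
//     net.setParam(1, 0, w);
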
int Net::getLayerId(const String &layer)
{
    return impl->getLayerId(layer);
}

Ptr<Layer> Net::getLayer(LayerId layerId)
{
    LayerData &ld = impl->getLayerData(layerId);
    return ld.getLayerInstance();
}

std::vector<Ptr<Layer> > Net::getLayerInputs(LayerId layerId)
{
    LayerData &ld = impl->getLayerData(layerId);
    if (!ld.layerInstance)
        CV_Error(Error::StsNullPtr, format("Requested layer \"%s\" was not initialized", ld.name.c_str()));

    std::vector<Ptr<Layer> > inputLayers;
    inputLayers.reserve(ld.inputLayersId.size());
    std::set<int>::iterator it;
    for (it = ld.inputLayersId.begin(); it != ld.inputLayersId.end(); ++it) {
        inputLayers.push_back(getLayer(*it));
    }
    return inputLayers;
}

std::vector<String> Net::getLayerNames() const
{
    std::vector<String> res;
    res.reserve(impl->layers.size());

    Impl::MapIdToLayerData::iterator it;
    for (it = impl->layers.begin(); it != impl->layers.end(); it++)
    {
        if (it->second.id) // skip the default Data layer
            res.push_back(it->second.name);
    }

    return res;
}

bool Net::empty() const
{
    return impl->layers.size() <= 1; // the first layer is the default Data layer
}

std::vector<int> Net::getUnconnectedOutLayers() const
{
    std::vector<int> layersIds;

    Impl::MapIdToLayerData::iterator it;
    for (it = impl->layers.begin(); it != impl->layers.end(); it++)
    {
        int lid = it->first;
        LayerData &ld = it->second;

        if (ld.requiredOutputs.size() == 0)
            layersIds.push_back(lid);
    }

    return layersIds;
}

std::vector<String> Net::getUnconnectedOutLayersNames() const
{
    std::vector<int> ids = getUnconnectedOutLayers();
    const size_t n = ids.size();
    std::vector<String> names(n);
    for (size_t i = 0; i < n; ++i)
    {
        names[i] = impl->layers[ids[i]].name;
    }
    return names;
}

void Net::getLayersShapes(const ShapesVec& netInputShapes,
                          std::vector<int>& layersIds,
                          std::vector<ShapesVec>& inLayersShapes,
                          std::vector<ShapesVec>& outLayersShapes) const
{
    layersIds.clear();
    inLayersShapes.clear();
    outLayersShapes.clear();

    Impl::LayersShapesMap inOutShapes;
    impl->getLayersShapes(netInputShapes, inOutShapes);

    for (Impl::LayersShapesMap::const_iterator it = inOutShapes.begin();
         it != inOutShapes.end(); it++)
    {
        layersIds.push_back(it->first);
        inLayersShapes.push_back(it->second.in);
        outLayersShapes.push_back(it->second.out);
    }
}

void Net::getLayersShapes(const MatShape& netInputShape,
                          std::vector<int>& layerIds,
                          std::vector<ShapesVec>& inLayersShapes,
                          std::vector<ShapesVec>& outLayersShapes) const
{
    getLayersShapes(ShapesVec(1, netInputShape),
                    layerIds, inLayersShapes, outLayersShapes);
}

void Net::getLayerShapes(const MatShape& netInputShape,
                         const int layerId,
                         ShapesVec& inLayerShapes,
                         ShapesVec& outLayerShapes) const
{
    getLayerShapes(ShapesVec(1, netInputShape),
                   layerId, inLayerShapes, outLayerShapes);
}

void Net::getLayerShapes(const ShapesVec& netInputShapes,
                         const int layerId,
                         ShapesVec& inLayerShapes,
                         ShapesVec& outLayerShapes) const
{
    LayerShapes shapes;
    impl->getLayerShapes(netInputShapes, layerId, shapes);
    inLayerShapes = shapes.in;
    outLayerShapes = shapes.out;
}

int64 Net::getFLOPS(const std::vector<MatShape>& netInputShapes) const
{
    CV_TRACE_FUNCTION();

    int64 flops = 0;
    std::vector<int> ids;
    std::vector<std::vector<MatShape> > inShapes, outShapes;
    getLayersShapes(netInputShapes, ids, inShapes, outShapes);
    CV_Assert(inShapes.size() == outShapes.size());
    CV_Assert(inShapes.size() == ids.size());

    for (size_t i = 0; i < ids.size(); i++)
    {
        flops += impl->layers[ids[i]].getLayerInstance()->getFLOPS(inShapes[i],
                                                                   outShapes[i]);
    }

    return flops;
}

int64 Net::getFLOPS(const MatShape& netInputShape) const
{
    return getFLOPS(std::vector<MatShape>(1, netInputShape));
}

int64 Net::getFLOPS(const int layerId,
                    const std::vector<MatShape>& netInputShapes) const
{
    Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
    CV_Assert(layer != impl->layers.end());

    LayerShapes shapes;
    impl->getLayerShapes(netInputShapes, layerId, shapes);

    return layer->second.getLayerInstance()->getFLOPS(shapes.in, shapes.out);
}

int64 Net::getFLOPS(const int layerId,
                    const MatShape& netInputShape) const
{
    return getFLOPS(layerId, std::vector<MatShape>(1, netInputShape));
}

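// Complexity-estimate sketch: shape() from shape_utils.hpp builds a
// MatShape, here for a hypothetical 1x3x224x224 input:
//
//     int64 flops = net.getFLOPS(shape(1, 3, 224, 224));
//     std::cout << "GFLOPs: " << flops * 1e-9 << std::endl;
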
void Net::getLayerTypes(std::vector<String>& layersTypes) const
{
    layersTypes.clear();

    std::map<String, int> layers;
    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
         it != impl->layers.end(); it++)
    {
        if (layers.find(it->second.type) == layers.end())
            layers[it->second.type] = 0;
        layers[it->second.type]++;
    }

    for (std::map<String, int>::iterator it = layers.begin();
         it != layers.end(); it++)
    {
        layersTypes.push_back(it->first);
    }
}

int Net::getLayersCount(const String& layerType) const
{
    int count = 0;
    for (Impl::MapIdToLayerData::iterator it = impl->layers.begin();
         it != impl->layers.end(); it++)
    {
        if (it->second.type == layerType)
            count++;
    }
    return count;
}

void Net::getMemoryConsumption(const int layerId,
                               const std::vector<MatShape>& netInputShapes,
                               size_t& weights, size_t& blobs) const
{
    CV_TRACE_FUNCTION();

    Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerId);
    CV_Assert(layer != impl->layers.end());

    weights = blobs = 0;

    for (size_t i = 0; i < layer->second.params.blobs.size(); i++)
    {
        const Mat& weightsBlob = layer->second.params.blobs[i];
        weights += weightsBlob.total()*weightsBlob.elemSize();
    }

    ShapesVec inLayerShapes, outLayerShapes;
    getLayerShapes(netInputShapes, layerId, inLayerShapes, outLayerShapes);
    for (size_t i = 0; i < outLayerShapes.size(); i++)
    {
        blobs += total(outLayerShapes[i]) * sizeof(float);
    }
}

void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
                               size_t& weights, size_t& blobs) const
{
    CV_TRACE_FUNCTION();

    std::vector<int> layerIds;
    std::vector<size_t> w, b;
    getMemoryConsumption(netInputShapes, layerIds, w, b);

    weights = blobs = 0;
    for (size_t i = 0; i < layerIds.size(); i++)
    {
        weights += w[i];
        blobs += b[i];
    }
}

void Net::getMemoryConsumption(const int layerId,
                               const MatShape& netInputShape,
                               size_t& weights, size_t& blobs) const
{
    getMemoryConsumption(layerId, std::vector<MatShape>(1, netInputShape),
                         weights, blobs);
}

void Net::getMemoryConsumption(const MatShape& netInputShape,
                               size_t& weights, size_t& blobs) const
{
    getMemoryConsumption(std::vector<MatShape>(1, netInputShape),
                         weights, blobs);
}

void Net::getMemoryConsumption(const std::vector<MatShape>& netInputShapes,
                               std::vector<int>& layerIds, std::vector<size_t>& weights,
                               std::vector<size_t>& blobs) const
{
    CV_TRACE_FUNCTION();

    layerIds.clear();
    weights.clear();
    blobs.clear();

    std::vector<std::vector<MatShape> > inLayerShapes, outLayerShapes;

    getLayersShapes(netInputShapes, layerIds, inLayerShapes, outLayerShapes);

    for (size_t i = 0; i < layerIds.size(); i++)
    {
        // Per-layer byte counters; size_t avoids the truncation that an int
        // accumulator would suffer on large models.
        size_t w = 0, b = 0;
        Impl::MapIdToLayerData::iterator layer = impl->layers.find(layerIds[i]);
        CV_Assert(layer != impl->layers.end());

        for (size_t j = 0; j < layer->second.params.blobs.size(); j++)
        {
            const Mat& weightsBlob = layer->second.params.blobs[j];
            w += weightsBlob.total()*weightsBlob.elemSize();
        }

        for (size_t j = 0; j < outLayerShapes[i].size(); j++)
        {
            b += total(outLayerShapes[i][j]) * sizeof(float);
        }

        weights.push_back(w);
        blobs.push_back(b);
    }
}

void Net::getMemoryConsumption(const MatShape& netInputShape, std::vector<int>& layerIds,
                               std::vector<size_t>& weights, std::vector<size_t>& blobs) const
{
    getMemoryConsumption(std::vector<MatShape>(1, netInputShape), layerIds,
                         weights, blobs);
}

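// Memory-estimate sketch for the same hypothetical input shape; `weights`
// counts parameter bytes and `blobs` counts fp32 output-blob bytes:
//
//     size_t weights = 0, blobs = 0;
//     net.getMemoryConsumption(shape(1, 3, 224, 224), weights, blobs);
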
void Net::enableFusion(bool fusion)
{
    if (impl->fusion != fusion)
    {
        impl->fusion = fusion;
        impl->netWasAllocated = false;
        impl->clear();
    }
}

void Net::setHalideScheduler(const String& scheduler)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(scheduler, "scheduler", scheduler.c_str());

    impl->halideConfigFile = scheduler;
}

int64 Net::getPerfProfile(std::vector<double>& timings)
{
    timings = std::vector<double>(impl->layersTimings.begin() + 1, impl->layersTimings.end());
    int64 total = (int64)std::accumulate(timings.begin(), timings.end(), 0.0);
    return total;
}

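// Profiling sketch: getPerfProfile() reports timings in ticks, so divide by
// getTickFrequency() to convert, e.g. to milliseconds:
//
//     std::vector<double> layerTimes;
//     int64 total = net.getPerfProfile(layerTimes);
//     double ms = total * 1000.0 / getTickFrequency();
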
//////////////////////////////////////////////////////////////////////////

Layer::Layer() { preferableTarget = DNN_TARGET_CPU; }

Layer::Layer(const LayerParams &params)
    : blobs(params.blobs), name(params.name), type(params.type)
{
    preferableTarget = DNN_TARGET_CPU;
}

void Layer::setParamsFrom(const LayerParams &params)
{
    blobs = params.blobs;
    name = params.name;
    type = params.type;
}

int Layer::inputNameToIndex(String)
{
    return -1;
}

int Layer::outputNameToIndex(const String&)
{
    return 0;
}

bool Layer::supportBackend(int backendId)
{
    return backendId == DNN_BACKEND_OPENCV;
}

Ptr<BackendNode> Layer::initHalide(const std::vector<Ptr<BackendWrapper> > &)
{
    CV_Error(Error::StsNotImplemented, "Halide pipeline of " + type +
                                       " layers is not defined.");
    return Ptr<BackendNode>();
}

Ptr<BackendNode> Layer::initInfEngine(const std::vector<Ptr<BackendWrapper> > &)
{
    CV_Error(Error::StsNotImplemented, "Inference Engine pipeline of " + type +
                                       " layers is not defined.");
    return Ptr<BackendNode>();
}

void Layer::applyHalideScheduler(Ptr<BackendNode>& node, const std::vector<Mat*> &inputs,
                                 const std::vector<Mat> &outputs, int targetId) const
{
#ifdef HAVE_HALIDE
    CV_TRACE_FUNCTION();

    Halide::Var x("x"), y("y"), c("c"), n("n"), co("co"), ci("ci"),
                xo("xo"), xi("xi"), yo("yo"), yi("yi"), tile("tile");
    Halide::Func& top = node.dynamicCast<HalideBackendNode>()->funcs.back();

    int outW, outH, outC, outN;
    getCanonicalSize(outputs[0].size, &outW, &outH, &outC, &outN);

    if (targetId == DNN_TARGET_CPU)
    {
        if (outW == 1 && outH == 1)
        {
            if (outC + outN == 1)
                return;

            if (outC > 8)
              top.split(c, co, ci, 8)
                 .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
                 .parallel(tile)
                 .vectorize(ci, 8);
            else
              top.fuse(x, y, tile).fuse(c, tile, tile).fuse(n, tile, tile)
                 .parallel(tile);
        }
        else
        {
            if (outH > 2)
            {
                top.reorder(x, c, y)
                   .split(y, yo, yi, 2)
                   .fuse(yo, n, tile)
                   .parallel(tile)
                   .unroll(yi)
                   .vectorize(x, outW >= 16 ? 16 : outW);
            }
        }
    }
    else if (targetId == DNN_TARGET_OPENCL)
    {
        if (outW == 1 && outH == 1)
        {
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : outC;
            top.split(c, co, ci, c_split)
               .fuse(x, y, tile).fuse(co, tile, tile).fuse(n, tile, tile)
               .gpu_blocks(tile)
               .gpu_threads(ci);
        }
        else
        {
            int x_split = outW > 8 ? (outW >= 32 ? 16 : 8) : outW;
            int y_split = outH > 8 ? (outH >= 32 ? 16 : 8) : outH;
            // Supported vectorization widths: 2, 3, 4, 8, 16
            int c_split = outC > 8 ? (outC > 16 ? 8 : 4) : std::min(4, outC);
            top.split(x, xo, xi, x_split).split(y, yo, yi, y_split)
               .split(c, co, ci, c_split)
               .gpu_blocks(xo, yo, co)
               .gpu_threads(xi, yi)
               .reorder(xi, yi, ci, xo, yo, co)
               .vectorize(ci);
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown target identifier");
#endif  // HAVE_HALIDE
}

Ptr<BackendNode> Layer::tryAttach(const Ptr<BackendNode>& node)
{
    CV_UNUSED(node);
    return Ptr<BackendNode>();
}

bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::tryFuse(Ptr<Layer>&) { return false; }
void Layer::getScaleShift(Mat& scale, Mat& shift) const
{
    scale = Mat();
    shift = Mat();
}

void Layer::unsetAttached()
{
    setActivation(Ptr<ActivationLayer>());
}

template <typename T>
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
{
    pv.resize(v.size());
    for (size_t i = 0; i < v.size(); i++)
        pv[i] = const_cast<T*>(&v[i]);
}

void Layer::finalize(const std::vector<Mat> &inputs, std::vector<Mat> &outputs)
{
    CV_TRACE_FUNCTION();
    this->finalize((InputArrayOfArrays)inputs, (OutputArrayOfArrays)outputs);
}

void Layer::finalize(const std::vector<Mat*> &input, std::vector<Mat> &output)
{
    CV_UNUSED(input); CV_UNUSED(output);
}

void Layer::finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr)
{
    CV_TRACE_FUNCTION();
    std::vector<Mat> inputs, outputs;
    inputs_arr.getMatVector(inputs);
    outputs_arr.getMatVector(outputs);

    std::vector<Mat*> inputsp;
    vecToPVec(inputs, inputsp);
    this->finalize(inputsp, outputs);
}

std::vector<Mat> Layer::finalize(const std::vector<Mat> &inputs)
{
    CV_TRACE_FUNCTION();

    std::vector<Mat> outputs;
    this->finalize(inputs, outputs);
    return outputs;
}

void Layer::forward(std::vector<Mat*> &input, std::vector<Mat> &output, std::vector<Mat> &internals)
{
    // This method is kept for backward compatibility; DNN now calls it only
    // to support users' implementations.
    CV_UNUSED(input); CV_UNUSED(output); CV_UNUSED(internals);
}

void Layer::forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    Layer::forward_fallback(inputs_arr, outputs_arr, internals_arr);
}

void Layer::forward_fallback(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(name, "name", name.c_str());

    if (preferableTarget == DNN_TARGET_OPENCL_FP16 && inputs_arr.depth() == CV_16S)
    {
        // FP16 path: convert inputs to FP32, run the layer, then convert the
        // results back so the caller keeps seeing FP16 buffers.
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;
        std::vector<UMat> internals;

        std::vector<UMat> orig_inputs;
        std::vector<UMat> orig_outputs;
        std::vector<UMat> orig_internals;

        inputs_arr.getUMatVector(orig_inputs);
        outputs_arr.getUMatVector(orig_outputs);
        internals_arr.getUMatVector(orig_internals);

        inputs.resize(orig_inputs.size());
        for (size_t i = 0; i < orig_inputs.size(); i++)
            convertFp16(orig_inputs[i], inputs[i]);

        outputs.resize(orig_outputs.size());
        for (size_t i = 0; i < orig_outputs.size(); i++)
            outputs[i].create(shape(orig_outputs[i]), CV_32F);

        internals.resize(orig_internals.size());
        for (size_t i = 0; i < orig_internals.size(); i++)
            internals[i].create(shape(orig_internals[i]), CV_32F);

        forward(inputs, outputs, internals);

        for (size_t i = 0; i < outputs.size(); i++)
            convertFp16(outputs[i], orig_outputs[i]);

        // sync results back
        outputs_arr.assign(orig_outputs);
        internals_arr.assign(orig_internals);
        return;
    }
    std::vector<Mat> inpvec;
    std::vector<Mat> outputs;
    std::vector<Mat> internals;

    inputs_arr.getMatVector(inpvec);
    outputs_arr.getMatVector(outputs);
    internals_arr.getMatVector(internals);

    std::vector<Mat*> inputs(inpvec.size());
    for (size_t i = 0; i < inpvec.size(); i++)
        inputs[i] = &inpvec[i];

    this->forward(inputs, outputs, internals);

    // sync results back
    outputs_arr.assign(outputs);
    internals_arr.assign(internals);
}

void Layer::run(const std::vector<Mat> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
    CV_TRACE_FUNCTION();

    this->finalize(inputs, outputs);
    this->forward(inputs, outputs, internals);
}

Layer::~Layer() {}

bool Layer::getMemoryShapes(const std::vector<MatShape> &inputs,
                            const int requiredOutputs,
                            std::vector<MatShape> &outputs,
                            std::vector<MatShape> &internals) const
{
    CV_Assert(inputs.size());
    outputs.assign(std::max(requiredOutputs, (int)inputs.size()), inputs[0]);
    return false;
}

//////////////////////////////////////////////////////////////////////////

static Mutex& getLayerFactoryMutex()
{
    static Mutex* volatile instance = NULL;
    if (instance == NULL)
    {
        cv::AutoLock lock(getInitializationMutex());
        if (instance == NULL)
            instance = new Mutex();
    }
    return *instance;
}

typedef std::map<String, std::vector<LayerFactory::Constructor> > LayerFactory_Impl;

static LayerFactory_Impl& getLayerFactoryImpl_()
{
    static LayerFactory_Impl impl;
    return impl;
}

static LayerFactory_Impl& getLayerFactoryImpl()
{
    static LayerFactory_Impl* volatile instance = NULL;
    if (instance == NULL)
    {
        cv::AutoLock lock(getLayerFactoryMutex());
        if (instance == NULL)
        {
            instance = &getLayerFactoryImpl_();
            initializeLayerFactory();
        }
    }
    return *instance;
}

void LayerFactory::registerLayer(const String &type, Constructor constructor)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();
    LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);

    if (it != getLayerFactoryImpl().end())
    {
        if (it->second.back() == constructor)
            CV_Error(cv::Error::StsBadArg, "Layer \"" + type_ + "\" was already registered");
        it->second.push_back(constructor);
    }
    else
    {
        getLayerFactoryImpl().insert(std::make_pair(type_, std::vector<Constructor>(1, constructor)));
    }
}

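// Registration sketch for a user-defined layer ("MyType" and MyLayer are
// hypothetical; the factory expects a function matching
// LayerFactory::Constructor):
//
//     static Ptr<Layer> createMyLayer(LayerParams& params)
//     {
//         return Ptr<Layer>(new MyLayer(params));
//     }
//
//     LayerFactory::registerLayer("MyType", createMyLayer);
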
void LayerFactory::unregisterLayer(const String &type)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();

    LayerFactory_Impl::iterator it = getLayerFactoryImpl().find(type_);
    if (it != getLayerFactoryImpl().end())
    {
        if (it->second.size() > 1)
            it->second.pop_back();
        else
            getLayerFactoryImpl().erase(it);
    }
}

Ptr<Layer> LayerFactory::createLayerInstance(const String &type, LayerParams& params)
{
    CV_TRACE_FUNCTION();
    CV_TRACE_ARG_VALUE(type, "type", type.c_str());

    cv::AutoLock lock(getLayerFactoryMutex());
    String type_ = type.toLowerCase();
    LayerFactory_Impl::const_iterator it = getLayerFactoryImpl().find(type_);

    if (it != getLayerFactoryImpl().end())
    {
        CV_Assert(!it->second.empty());
        return it->second.back()(params);
    }
    else
    {
        return Ptr<Layer>(); // NULL
    }
}

BackendNode::BackendNode(int backendId) : backendId(backendId) {}

BackendNode::~BackendNode() {}

BackendWrapper::BackendWrapper(int backendId, int targetId)
    : backendId(backendId), targetId(targetId) {}

BackendWrapper::BackendWrapper(int targetId, const cv::Mat& m)
{
    CV_UNUSED(targetId); CV_UNUSED(m);
    CV_Error(Error::StsNotImplemented,
             "Constructor of backend wrapper must be implemented");
}

BackendWrapper::BackendWrapper(const Ptr<BackendWrapper>& base, const MatShape& shape)
{
    CV_UNUSED(base); CV_UNUSED(shape);
    CV_Error(Error::StsNotImplemented,
             "Constructor of backend wrapper must be implemented");
}

BackendWrapper::~BackendWrapper() {}

Net readNet(const String& _model, const String& _config, const String& _framework)
{
    String framework = _framework.toLowerCase();
    String model = _model;
    String config = _config;
    const std::string modelExt = model.substr(model.rfind('.') + 1);
    const std::string configExt = config.substr(config.rfind('.') + 1);
    if (framework == "caffe" || modelExt == "caffemodel" || configExt == "caffemodel" ||
                                modelExt == "prototxt" || configExt == "prototxt")
    {
        if (modelExt == "prototxt" || configExt == "caffemodel")
            std::swap(model, config);
        return readNetFromCaffe(config, model);
    }
    if (framework == "tensorflow" || modelExt == "pb" || configExt == "pb" ||
                                     modelExt == "pbtxt" || configExt == "pbtxt")
    {
        if (modelExt == "pbtxt" || configExt == "pb")
            std::swap(model, config);
        return readNetFromTensorflow(model, config);
    }
    if (framework == "torch" || modelExt == "t7" || modelExt == "net" ||
                                configExt == "t7" || configExt == "net")
    {
        return readNetFromTorch(model.empty() ? config : model);
    }
    if (framework == "darknet" || modelExt == "weights" || configExt == "weights" ||
                                  modelExt == "cfg" || configExt == "cfg")
    {
        if (modelExt == "cfg" || configExt == "weights")
            std::swap(model, config);
        return readNetFromDarknet(config, model);
    }
    if (framework == "dldt" || modelExt == "bin" || configExt == "bin" ||
                               modelExt == "xml" || configExt == "xml")
    {
        if (modelExt == "xml" || configExt == "bin")
            std::swap(model, config);
        return readNetFromModelOptimizer(config, model);
    }
    if (framework == "onnx" || modelExt == "onnx")
    {
        return readNetFromONNX(model);
    }
    CV_Error(Error::StsError, "Cannot determine an origin framework of files: " +
                                      model + (config.empty() ? "" : ", " + config));
}

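// Loading sketch (file names are placeholders): the framework is inferred
// from the extensions as implemented above, so model and config may be
// passed in either order for the supported pairs:
//
//     Net net = readNet("deploy.prototxt", "weights.caffemodel");
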
Net readNet(const String& _framework, const std::vector<uchar>& bufferModel,
            const std::vector<uchar>& bufferConfig)
{
    String framework = _framework.toLowerCase();
    if (framework == "caffe")
        return readNetFromCaffe(bufferConfig, bufferModel);
    else if (framework == "tensorflow")
        return readNetFromTensorflow(bufferModel, bufferConfig);
    else if (framework == "darknet")
        return readNetFromDarknet(bufferConfig, bufferModel);
    else if (framework == "torch")
        CV_Error(Error::StsNotImplemented, "Reading Torch models from buffers");
    else if (framework == "dldt")
        CV_Error(Error::StsNotImplemented, "Reading Intel's Model Optimizer models from buffers");
    CV_Error(Error::StsError, "Cannot determine an origin framework with a name " + framework);
}

Net readNetFromModelOptimizer(const String &xml, const String &bin)
{
    return Net::readFromModelOptimizer(xml, bin);
}

CV__DNN_EXPERIMENTAL_NS_END
}} // namespace