5ef267270d77527de13ade7daa78cf7682c72551
[platform/upstream/dldt.git] / inference-engine / src / legacy_api / src / cnn_network_int8_normalizer.cpp
1 // Copyright (C) 2018-2020 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include "cnn_network_int8_normalizer.hpp"
6
7 #include <data_stats.h>
8 #include <details/ie_cnn_network_tools.h>
9 #include <ie_common.h>
10
11 #include <algorithm>
12 #include <blob_factory.hpp>
13 #include <cassert>
14 #include <cmath>
15 #include <details/caseless.hpp>
16 #include <fstream>
17 #include <limits>
18 #include <map>
19 #include <memory>
20 #include <set>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "cnn_network_impl.hpp"
26 #include "cnn_network_stats_impl.hpp"
27 #include "ie_util_internal.hpp"
28
29 IE_SUPPRESS_DEPRECATED_START
30
31 using namespace std;
32 using namespace InferenceEngine;
33 using namespace InferenceEngine::details;
34
35 using StatsMap = std::map<std::string, InferenceEngine::NetworkNodeStatsPtr>;
36
37 CNNStatisticHelper::CNNStatisticHelper(CNNNetwork& network,
38                                        const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
39                                        int maxSign, int maxUnsign) {
40     internalNodesStats_ = internalNodesStats;
41     network_ = network;
42     maxSign_ = maxSign;
43     maxUnsign_ = maxUnsign;
44
45     NormalizeStatistic();
46 }
47
48 bool CNNStatisticHelper::canLayerBeQuantized(CNNLayer::Ptr layer) const {
49     // verification of existing statistic for all inputs
50     for (const auto i : layer->insData) {
51         if (internalNodesStats_.find(i.lock()->getCreatorLayer().lock()->name) == internalNodesStats_.end()) {
52             return false;
53         }
54     }
55     // verification if there is a statistic for output of the layer
56     if ((layer->outData.size() > 1) && (internalNodesStats_.find(layer->name) == internalNodesStats_.end())) {
57         return false;
58     }
59     return true;
60 }
61
62 void CNNStatisticHelper::copyStatistics(const std::string& srcName, const std::string& dstName) {
63     internalNodesStats_[dstName] = internalNodesStats_[srcName];
64 }
65
66 bool CNNStatisticHelper::hasNegativeOutput(const std::string& layerName, int outputPort) const {
67     // TODO(amalyshe) parameter outputPort is not used yet, logic of dedication to the port
68     // should be implemented
69
70     NetworkNodeStatsPtr layerStat = internalNodesStats_.at(layerName);
71     for (auto v : layerStat->_minOutputs) {
72         if (v < 0.f) {
73             return true;
74         }
75     }
76     return false;
77 }
78
// Computes the per-channel input scale blob for `layer` from its producer's
// statistics. Returns nullptr when an input edge has expired.
InferenceEngine::Blob::Ptr CNNStatisticHelper::getInputScale(CNNLayer::Ptr layer) const {
    auto inDataPtr = layer->insData[0].lock();
    if (inDataPtr == nullptr)
        return nullptr;
    auto previousLayer = inDataPtr->getCreatorLayer().lock();
    std::string inputLayerName = previousLayer->name;

    // for case when we have the only average pooling before, we need to take this
    // statistic from input of avg pooling to compensate work of average pooling
    // and to stay in int8 as much as we can
    if (previousLayer->type == "Pooling" &&
        (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) {
        // take input name to the pooling
        auto prevInDataPtr = previousLayer->insData[0].lock();
        if (prevInDataPtr == nullptr)
            return nullptr;
        // NOTE(review): inputLayerName is reassigned here but never read again in
        // this function — the statistics below are taken from previousLayer
        // directly. Looks like dead code or an unfinished redirection; confirm.
        inputLayerName = prevInDataPtr->getCreatorLayer().lock()->name;
    }
    size_t inputChannels = inDataPtr->getTensorDesc().getDims()[1];
    // statistics must be per-channel for the producer's output
    if (getStatistic(previousLayer)->_minOutputs.size() != inputChannels ||
        getStatistic(previousLayer)->_maxOutputs.size() != inputChannels) {
        THROW_IE_EXCEPTION << "min and max sizes should be equal to input channels count for " << previousLayer->name;
    }

    // current normalization algorithm can have nodes with fp32 edges. it can happen only in places
    // of initial quantization of int8 chains. Currently adding scaleshift adds certain I8/U8 precision
    // but calculation of scales happens before adding of scale shifts.
    // for fixing problem with cases of not determined yet precision and for following of
    // quantization scheme defined by normalizer, we are adding here verification of negative output
    // in some cases and then verify exact precision of I8/U8 on node for covering of fully determined cases
    int maxValue = hasNegativeOutput(previousLayer->name) ? maxSign_ : maxUnsign_;
    if (previousLayer->outData[0]->getPrecision() == Precision::U8) {
        maxValue = maxUnsign_;
    } else if (previousLayer->outData[0]->getPrecision() == Precision::I8) {
        maxValue = maxSign_;
    }

    return calculateScaleFactor(inputChannels, getStatistic(previousLayer), maxValue);
}
118
119 InferenceEngine::Blob::Ptr CNNStatisticHelper::getOutputScale(CNNLayer::Ptr layer) const {
120     // TODO(amalyshe) for now we are looking to precision on the data node
121     size_t outputChannels = layer->outData[0]->getTensorDesc().getDims()[1];
122     if (layer->outData.size() != 1) {
123         THROW_IE_EXCEPTION << "Trying to get scales after layer having multiple output ports";
124     }
125
126     auto it = internalNodesStats_.find(layer->name);
127     if (it == internalNodesStats_.end()) {
128         return std::shared_ptr<Blob>();
129     }
130
131     if (getStatistic(layer)->_minOutputs.size() != outputChannels ||
132         getStatistic(layer)->_maxOutputs.size() != outputChannels) {
133         THROW_IE_EXCEPTION << "min and max sizes should be equal to output channels count for " << layer->name;
134     }
135
136     return calculateScaleFactor(outputChannels, getStatistic(layer),
137                                 layer->outData[0]->getPrecision() == Precision::I8 ? maxSign_ : maxUnsign_);
138 }
139
// Returns the magnitude bound used for signed int8 quantization ranges.
int CNNStatisticHelper::getMaxSignValue() const {
    return maxSign_;
}
143
144 InferenceEngine::Blob::Ptr CNNStatisticHelper::calculateScaleFactor(size_t channels, NetworkNodeStatsPtr stats,
145                                                                     int maxInt) const {
146     if (stats->_minOutputs.size() != channels || stats->_maxOutputs.size() != channels) {
147         THROW_IE_EXCEPTION << "min and max sizes should be equal to channels count";
148     }
149
150     // Creating i-scale blob
151     std::shared_ptr<Data> iScaleData =
152         std::shared_ptr<Data>(new Data("scale", {Precision::FP32, {channels}, Layout::C}));
153     auto iScale = CreateBlobFromData(iScaleData);
154     iScale->allocate();
155     float* iScaleMemory = static_cast<float*>(iScale->buffer());
156
157     for (int c = 0; c < channels; c++) {
158         // maxc = fmax(maxc, fabs(stats[k]->_minOutputs[c]));        // TODO Check if we should take minimums into
159         // account
160         float maxc = fabs(stats->_maxOutputs[c]);
161         maxc = fmax(maxc, fabs(stats->_minOutputs[c]));
162
163         iScaleMemory[c] = maxc / static_cast<float>(maxInt);
164
165         if (fabs(iScaleMemory[c]) < 1e-7) {
166             iScaleMemory[c] = 1.0f;
167         }
168     }
169     return iScale;
170 }
171
172 NetworkNodeStatsPtr CNNStatisticHelper::getStatistic(CNNLayer::Ptr layer) const {
173     // TODO(amalyshe) all logic of traversing over network and get apropriate statistics should be here
174     // for now it is a stub
175     auto it = internalNodesStats_.find(getLatestInFuse(layer)->name);
176     if (it != internalNodesStats_.end()) {
177         return it->second;
178     }
179     THROW_IE_EXCEPTION << "no stat for layer " << getLatestInFuse(layer)->name;
180 }
181
// Walks forward from `layer` to the last layer of its fusion group
// (Conv+ReLU, Conv+Sum, Conv+Sum+ReLU) and returns it. Statistics for the
// whole fuse are attributed to the returned layer.
CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const {
    // the single ReLU (or ReLU-like Clamp) consumer is fused into this layer
    if (layer->outData[0]->getInputTo().size() == 1 &&
        (CaselessEq<std::string>()(layer->outData[0]->getInputTo().begin()->second->type, "relu") ||
         CNNNetworkInt8Normalizer::isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second))) {
        return layer->outData[0]->getInputTo().begin()->second;
    }
    // Conv-Sum-ReLU fuse
    // We need to return the original layer if it will be used as the sum operand;
    // iterate over the consumers of this layer and look for the only eltwise
    CNNLayer::Ptr eltwise = nullptr;
    if (layer->outData.size() == 1) {
        for (auto it : layer->outData[0]->getInputTo()) {
            if (CaselessEq<std::string>()(it.second->type, "eltwise")) {
                // more than one eltwise consumer is not supported
                if (eltwise) {
                    THROW_IE_EXCEPTION << "Pattern when one layer pass data to several eltwise layers are not "
                                          "supported in int8 quantization";
                }
                eltwise = it.second;
            }
        }
    }

    if (eltwise) {
        // if current layer is not a convolution return it as finish of fuse
        if (!CaselessEq<std::string>()(layer->type, "convolution")) {
            return layer;
        } else {
            // look to the ports of eltwise
            if (eltwise->insData[0].lock() != nullptr
                    && eltwise->insData[1].lock() != nullptr
                    && eltwise->insData[1].lock()->getCreatorLayer().lock() == layer
                    && CaselessEq<std::string>()(eltwise->insData[0].lock()->getCreatorLayer().lock()->type, "convolution")
                    && eltwise->insData[0].lock()->getInputTo().size() == 1) {
                // this is a case when two convolutions come to eltwise, the second one will be selected for fuse,
                // first will be used as sum operator
                return layer;
            }
            // given layer is a convolution and will be used for fuse, but we need to verify if there is ReLU after
            // eltwise
            if (eltwise->outData[0]->getInputTo().size() == 1 &&
                (CaselessEq<std::string>()(eltwise->outData[0]->getInputTo().begin()->second->type, "relu") ||
                 CNNNetworkInt8Normalizer::isReLULikeClamp(eltwise->outData[0]->getInputTo().begin()->second))) {
                return eltwise->outData[0]->getInputTo().begin()->second;
            }
            return eltwise;
        }
    }

    // no fusable consumer: the layer itself ends the fuse
    return layer;
}
232
// Converts the raw collected statistics (internalNodesStats_) into a
// normalized map: ReLU-like clamp minimums are zeroed, statistics are
// propagated through pass-through layers (Pooling/ReLU), concatenated for
// Concat, and reduced to a single per-tensor range where per-channel scaling
// is not applicable. The result replaces internalNodesStats_.
void CNNStatisticHelper::NormalizeStatistic() {
    StatsMap newMap;

    // In case when we have statistics in negative range when min clamped value is 0,
    // we are changing statistics here to non negative. This is not fully correct behaviour since
    // it can extend range and affect accuracy, but this approach works quite well
    std::vector<CNNLayerPtr> sortedLayersRC = CNNNetSortTopologically(network_);
    for (auto l : sortedLayersRC) {
        if (CNNNetworkInt8Normalizer::isReLULikeClamp(l)) {
            if (l->outData.size() == 1) {
                size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[1];
                auto oldStat = internalNodesStats_.find(l->name);
                if ((oldStat != internalNodesStats_.end()) && outputChannels > 1) {
                    // clamp-with-zero-floor cannot produce negatives
                    for (size_t q = 0; q < oldStat->second->_minOutputs.size(); q++) {
                        oldStat->second->_minOutputs[q] = 0.f;
                    }
                }
            }
        }
    }

    float dummy = 0.0f;  // unused out-parameter for GetDataMinMax's max slot

    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network_);
    for (auto l : sortedLayers) {
        // if layer's statistic exists in the newMap, ignore it
        if (newMap.find(l->name) != newMap.end()) {
            continue;
        }
        // verify if layer is starter layer for propagating of statistic
        bool isStarterLayer = false;

        // a case if we do not have converted statistic before the current layer
        // go over all inputs and verify if statistic exists for all of inputs
        bool allInputsHaveStatistics = true;
        for (auto i : l->insData) {
            if (newMap.find(i.lock()->getCreatorLayer().lock()->name) == newMap.end()) {
                allInputsHaveStatistics = false;
                break;
            }
        }
        // if we do not have statistic - verify who is consumer of this layer
        if (!allInputsHaveStatistics) {
            if (l->outData.size() == 1) {
                // a layer feeding a quantizable consumer can start a statistic chain
                for (auto it : l->outData[0]->getInputTo()) {
                    if (CaselessEq<std::string>()(it.second->type, "scaleshift") ||
                        CaselessEq<std::string>()(it.second->type, "convolution") ||
                        CaselessEq<std::string>()(it.second->type, "fullyconnected")) {
                        isStarterLayer = true;
                        break;
                    }
                }
            }
        } else {
            isStarterLayer = true;
        }
        // quantizable layers always start a chain themselves
        if (CaselessEq<std::string>()(l->type, "scaleshift") || CaselessEq<std::string>()(l->type, "convolution") ||
            CaselessEq<std::string>()(l->type, "fullyconnected")) {
            isStarterLayer = true;
        }

        if (!isStarterLayer) {
            continue;
        }

        // we do not support yet layers for quantization which split data
        if (l->outData.size() != 1) {
            continue;
        }

        InferenceEngine::NetworkNodeStatsPtr currentStat = std::make_shared<NetworkNodeStats>();

        bool perChannelScale = true;

        if (CaselessEq<std::string>()(l->type, "concat") && l->outData.size() == 1 &&
            l->outData[0]->getTensorDesc().getDims().size() == 4 && allInputsHaveStatistics) {
            // concat along channels: statistics are the inputs' statistics
            // appended in input order
            size_t concatLayerIdx = 0;
            for (int k = 0; k < l->insData.size(); k++) {
                auto prevKLayer = l->insData[k].lock()->getCreatorLayer().lock();
                // looking for the statistic for prevKLayer
                auto kLayerStat = newMap.find(prevKLayer->name);
                if (kLayerStat != newMap.end()) {
                    for (size_t ikStat = 0; ikStat < kLayerStat->second->_maxOutputs.size();
                         ikStat++, concatLayerIdx++) {
                        currentStat->_maxOutputs.push_back(kLayerStat->second->_maxOutputs[ikStat]);
                        currentStat->_minOutputs.push_back(kLayerStat->second->_minOutputs[ikStat]);
                    }
                } else {
                    THROW_IE_EXCEPTION << "We have incomplete statistic for predecessors of concat layer " << l->name;
                }
            }
        } else if (CaselessEq<std::string>()(l->type, "resample")) {
            // resample after a concat reuses the concat's statistics;
            // otherwise it keeps its own recorded statistics
            if (l->insData.size() == 1) {
                CNNLayerPtr creator = l->insData[0].lock()->getCreatorLayer().lock();
                if (CaselessEq<std::string>()(creator->type, "concat")) {
                    auto concatStat = newMap[creator->name];
                    currentStat->_maxOutputs = concatStat->_maxOutputs;
                    currentStat->_minOutputs = concatStat->_minOutputs;
                    newMap[l->name] = currentStat;
                } else {
                    auto itOld = internalNodesStats_.find(l->name);
                    if (itOld != internalNodesStats_.end()) {
                        currentStat->_maxOutputs = itOld->second->_maxOutputs;
                        currentStat->_minOutputs = itOld->second->_minOutputs;
                        newMap[l->name] = currentStat;
                    }
                }
            }
        } else {
            // go over all children until we get convolution, scaleshift, eltwise or unknown layer
            // layers Pooling and ReLU are passthrough
            // to understand the granularity of the scaling
            // layer concat is a layer which produce statistics and waterfall it down
            std::vector<CNNLayer::Ptr> toAnalyze;
            for (auto it : l->outData[0]->getInputTo()) {
                toAnalyze.push_back(it.second);
            }

            if (CaselessEq<std::string>()(l->type, "eltwise")) {
                perChannelScale = false;
            }
            // DFS over pass-through consumers; any eltwise or non-depthwise
            // convolution downstream forces a per-tensor scale
            while (!toAnalyze.empty() && perChannelScale) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                if (CaselessEq<std::string>()(tl->type, "pooling") || CaselessEq<std::string>()(tl->type, "relu") ||
                    CNNNetworkInt8Normalizer::isReLULikeClamp(tl) || CaselessEq<std::string>()(tl->type, "concat")) {
                    if (tl->outData.size() == 1) {
                        for (auto it : tl->outData[0]->getInputTo()) {
                            toAnalyze.push_back(it.second);
                        }
                    }
                } else if (CaselessEq<std::string>()(tl->type, "convolution")) {
                    // verify number of groups
                    ConvolutionLayer* pConv = dynamic_cast<ConvolutionLayer*>(tl.get());
                    if (pConv == nullptr) {
                        THROW_IE_EXCEPTION << "Layer " << tl->name << " is not instance of ConvolutionLayer class";
                    }
                    // only depthwise (group == out_depth) keeps channels independent
                    if (pConv->_group != pConv->_out_depth) {
                        perChannelScale = false;
                    }
                } else if (CaselessEq<std::string>()(tl->type, "eltwise")) {
                    perChannelScale = false;
                }
            }

            auto itOld = internalNodesStats_.find(getLatestInFuse(l)->name);
            if (itOld == internalNodesStats_.end()) {
                itOld = internalNodesStats_.find(l->name);
            }
            if (itOld != internalNodesStats_.end()) {
                if (!perChannelScale) {
                    // collapse to a single per-tensor range: max |value| across
                    // channels for max, global minimum for min
                    currentStat->_maxOutputs.resize(itOld->second->_maxOutputs.size());
                    if (!itOld->second->_maxOutputs.empty()) {
                        // NOTE(review): FLT_MIN is the smallest positive float, not
                        // the most negative; presumably GetDataAbsMax only raises it,
                        // where it is effectively zero — confirm.
                        float max = FLT_MIN;
                        DataStats::GetDataAbsMax(&itOld->second->_maxOutputs[0], itOld->second->_maxOutputs.size(),
                                                 max);
                        std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);
                    }

                    currentStat->_minOutputs.resize(itOld->second->_minOutputs.size());
                    if (!itOld->second->_minOutputs.empty()) {
                        float min = FLT_MAX;
                        DataStats::GetDataMinMax(&itOld->second->_minOutputs[0], itOld->second->_minOutputs.size(), min,
                                                 dummy);
                        std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
                    }
                } else {
                    currentStat->_maxOutputs = itOld->second->_maxOutputs;
                    currentStat->_minOutputs = itOld->second->_minOutputs;
                }
            }

            // a single recorded (min, max) pair is broadcast to all channels
            if (l->outData.size() == 1) {
                size_t ch_indx = l->outData[0]->getTensorDesc().getDims().size() > 1 ? 1 : 0;
                size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[ch_indx];
                auto oldStat = internalNodesStats_.find(l->name);
                if ((oldStat != internalNodesStats_.end()) && outputChannels > 1 &&
                    oldStat->second->_minOutputs.size() == 1) {
                    auto min = oldStat->second->_minOutputs[0];
                    auto max = oldStat->second->_maxOutputs[0];

                    currentStat->_minOutputs = std::vector<float>(outputChannels);
                    currentStat->_maxOutputs = std::vector<float>(outputChannels);
                    std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
                    std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);
                }
            }
        }

        // propagate this statistic to all layers without scale in primitives
        if (!currentStat->_maxOutputs.empty() && !currentStat->_minOutputs.empty()) {
            std::vector<CNNLayer::Ptr> toAnalyze;
            toAnalyze.push_back(l);
            while (!toAnalyze.empty()) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                newMap[tl->name] = currentStat;
                if (tl->outData.size() == 1) {
                    for (auto it : tl->outData[0]->getInputTo()) {
                        if (CaselessEq<std::string>()(it.second->type, "pooling") ||
                            CaselessEq<std::string>()(it.second->type, "relu") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(it.second)) {
                            toAnalyze.push_back(it.second);
                        }
                    }
                }
            }
        }
    }

    internalNodesStats_ = newMap;
}
445
446 void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor,
447                                                                size_t port) {
448     // verify if data exists
449     if (newLayer && successor && successor->insData.size() > port) {
450         // get the insData
451         DataPtr pData = successor->insData[port].lock();
452
453         Data* edge2 = new Data(*pData.get());
454         DataPtr newEdge(edge2);
455         newEdge->getInputTo().clear();
456         newEdge->getInputTo()[successor->name] = successor;
457         newEdge->setName(newLayer->name);
458         newEdge->getCreatorLayer() = newLayer;
459         successor->insData[port] = newEdge;
460         newLayer->outData.push_back(newEdge);
461
462         newLayer->insData.push_back(pData);
463         pData->getInputTo().erase(successor->name);
464         pData->getInputTo()[newLayer->name] = newLayer;
465     } else {
466         THROW_IE_EXCEPTION << "Invalid argument";
467     }
468 }
469
470 CNNLayer::Ptr CNNNetworkInt8Normalizer::addU8ToI8Conversion(DataPtr data, CNNLayer::Ptr successor,
471                                                             CNNStatisticHelper& statHelper) {
472     if (data->getPrecision() == Precision::U8 || data->getPrecision() == Precision::I8) {
473         size_t c = static_cast<size_t>(data->getDims()[1]);
474
475         std::vector<float> ssWValues;
476         std::vector<float> ssSValues;
477         for (auto i = 0; i < c; i++) {
478             ssWValues.push_back(1.0f);
479             ssSValues.push_back(0.0f);
480         }
481         std::string layerName = data->getCreatorLayer().lock()->name + "_Eltwise_ScaleShift_U8I8_" + successor->name;
482         CNNLayer::Ptr newLayer = createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
483         newLayer->precision = Precision::I8;
484
485         for (size_t i = 0; i < successor->insData.size(); i++) {
486             if (successor->insData[i].lock() == data) {
487                 AddLayerToCNNNetworkBeforeLayer(newLayer, successor, i);
488
489                 // update statistic to pass quantization smoothly
490                 if (newLayer->insData[0].lock() == nullptr)
491                     continue;
492                 std::string inputLayerName = newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
493                 statHelper.copyStatistics(inputLayerName, layerName);
494                 if (data->getPrecision() == Precision::U8) {
495                     newLayer->outData[0]->setPrecision(Precision::I8);
496                 } else {
497                     newLayer->outData[0]->setPrecision(Precision::U8);
498                 }
499             }
500         }
501         return newLayer;
502     }
503     return nullptr;
504 }
505
506 void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer,
507                                                              const std::string& nextLayerName) {
508     // verify if data exists
509     if (pData && layer && pData->getCreatorLayer().lock() &&
510         pData->getInputTo().find(nextLayerName) != pData->getInputTo().end()) {
511         CNNLayerPtr nextLayer = pData->getInputTo()[nextLayerName];
512
513         DataPtr newEdgeAfterLayer(new Data(*pData.get()));
514         newEdgeAfterLayer->setName(layer->name);
515         newEdgeAfterLayer->getCreatorLayer() = layer;
516         newEdgeAfterLayer->getInputTo().clear();
517         newEdgeAfterLayer->getInputTo()[nextLayerName] = nextLayer;
518         newEdgeAfterLayer->setPrecision(Precision::FP32);
519
520         pData->getInputTo().erase(nextLayerName);
521         pData->getInputTo()[layer->name] = layer;
522
523         layer->insData.push_back(pData);
524         layer->outData.push_back(newEdgeAfterLayer);
525
526         for (size_t i = 0; i < nextLayer->insData.size(); i++) {
527             if (nextLayer->insData[i].lock() == pData) {
528                 nextLayer->insData[i] = newEdgeAfterLayer;
529             }
530         }
531     } else {
532         THROW_IE_EXCEPTION << "Invalid argument";
533     }
534 }
535
536 void CNNNetworkInt8Normalizer::fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN,
537                                                 float* weightsD) {
538     // Setting "scales"
539     SizeVector weightsSize = {c};
540     TensorDesc weightsDesc(Precision::FP32, weightsSize, InferenceEngine::C);
541     scshLayer->_weights = InferenceEngine::make_shared_blob<float>(weightsDesc);
542     scshLayer->_weights->allocate();
543     float* weightsData = scshLayer->_weights->buffer();
544     for (size_t i = 0; i < c; i++) {
545         if (weightsN == nullptr && weightsD != nullptr) {
546             weightsData[i] = 1.0 / weightsD[i];
547         } else if (weightsD == nullptr && weightsN != nullptr) {
548             weightsData[i] = weightsN[i];
549         } else if (weightsN != nullptr && weightsD != nullptr) {
550             weightsData[i] = weightsN[i] / weightsD[i];
551         } else {
552             weightsData[i] = 1.0;
553         }
554     }
555
556     // Setting "shifts"
557     SizeVector shiftsSize = {c};
558     TensorDesc shiftsDesc(Precision::FP32, shiftsSize, InferenceEngine::C);
559     scshLayer->_biases = InferenceEngine::make_shared_blob<float>(shiftsDesc);
560     scshLayer->_biases->allocate();
561     float* biasesData = scshLayer->_biases->buffer();
562     for (size_t i = 0; i < c; i++) {
563         biasesData[i] = 0.f;  // Setting to constant "0"
564     }
565 }
566
567 void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2,
568                                                     CNNStatisticHelper& statHelper) {
569     if (CaselessEq<std::string>()(layer2->type, "priorbox") ||
570         CaselessEq<std::string>()(layer2->type, "priorboxclustered")) {
571         return;
572     }
573
574     // Searching the connection between the layers
575     int l1_out_i = 0;
576     for (; l1_out_i < layer1->outData.size(); l1_out_i++) {
577         if (layer1->outData[l1_out_i]->getInputTo().find(layer2->name) !=
578             layer1->outData[l1_out_i]->getInputTo().end()) {
579             break;
580         }
581     }
582     if (l1_out_i == layer1->outData.size()) {
583         THROW_IE_EXCEPTION << "Can't find layer " << layer2->name << " among layer " << layer1->name << " outputs";
584     }
585
586     int l2_in_i = 0;
587     for (; l2_in_i < layer2->insData.size(); l2_in_i++) {
588         if (layer2->insData[l2_in_i].lock() != nullptr
589                 && layer2->insData[l2_in_i].lock()->getCreatorLayer().lock() == layer1) {
590             break;
591         }
592     }
593     if (l2_in_i == layer2->insData.size()) {
594         THROW_IE_EXCEPTION << "Can't find layer " << layer2->name << " among layer " << layer1->name << " inputs";
595     }
596
597     DataPtr outData = layer1->outData[l1_out_i];
598
599     Blob::Ptr oScaleBlob = nullptr;
600     if (layer1->blobs.find("o-scale") != layer1->blobs.end()) {
601         oScaleBlob = layer1->blobs["o-scale"];
602     }
603
604     Blob::Ptr iScaleBlob = nullptr;
605     if (layer2->blobs.find("i-scale") != layer2->blobs.end()) {
606         iScaleBlob = layer2->blobs["i-scale"];
607     }
608
609     if (iScaleBlob == nullptr && oScaleBlob == nullptr) {
610         return;  // No multipliers found around this edge. We can't create a ScaleShift here;
611     } else {
612         // Creating a ScaleShiftLayer
613         std::string prefix;
614         float *iScaleBuffer = nullptr, *oScaleBuffer = nullptr;
615         if (oScaleBlob != nullptr) {
616             oScaleBuffer = static_cast<float*>(oScaleBlob->buffer());
617             prefix += "o";
618         }
619         if (iScaleBlob != nullptr) {
620             iScaleBuffer = static_cast<float*>(iScaleBlob->buffer());
621             prefix += "i";
622         }
623
624         std::string layerName = layer1->name + "_" + prefix + "ScaleShift_" + layer2->name;
625         LayerParams ssCnnLayerParams {layerName, "ScaleShift", Precision::FP32};
626         CNNLayerPtr ssCnnLayer(new ScaleShiftLayer(ssCnnLayerParams));
627
628         AddLayerToCNNNetworkAfterData(outData, ssCnnLayer, layer2->name);
629
630         size_t c = static_cast<size_t>(outData->getDims()[1]);
631
632         {
633             ScaleShiftLayer* scshLayer = dynamic_cast<ScaleShiftLayer*>(ssCnnLayer.get());
634             if (scshLayer == nullptr) {
635                 THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not instance of ScaleShiftLayer class";
636             }
637             fillInScaleShift(scshLayer, c, oScaleBuffer, iScaleBuffer);
638         }
639
640         Precision odPrecision = Precision::FP32;
641         if (layer2->precision == Precision::I8) {
642             odPrecision = statHelper.hasNegativeOutput(layer1->name) ? Precision::I8 : Precision::U8;
643         }
644         ssCnnLayer->outData[0]->setPrecision(odPrecision);
645     }
646 }
647
648 void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper) {
649     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
650
651     std::vector<std::pair<CNNLayerPtr, CNNLayerPtr>> pairs;
652
653     for (auto iter : sortedLayers) {
654         for (int l1_out_i = 0; l1_out_i < iter->outData.size(); l1_out_i++) {
655             for (auto nextIter : iter->outData[l1_out_i]->getInputTo()) {
656                 CNNLayer::Ptr next = nextIter.second;
657
658                 // Checking for an INT8 convolution or fully connected with FP32 output
659                 if ((CaselessEq<std::string>()(iter->type, "Convolution") ||
660                      CaselessEq<std::string>()(iter->type, "FullyConnected")) &&
661                     iter->precision == Precision::I8 && next->precision == Precision::FP32 &&
662                     iter->outData[l1_out_i]->getPrecision() == Precision::FP32) {
663                     // Do nothing here only if iter provides data to fp32 layers
664                     // MKLDNNPlugin will generate x8->f32 convolution
665
666                 } else if ((iter->precision != Precision::FP32 && next->precision == Precision::FP32) ||
667                            (iter->precision == Precision::FP32 && next->precision != Precision::FP32)) {
668                     pairs.push_back(std::pair<CNNLayerPtr, CNNLayerPtr>(iter, next));
669                 }
670             }
671         }
672     }
673
674     for (auto& pair : pairs) {
675         AddScaleShiftBetween(net, pair.first, pair.second, statHelper);
676     }
677 }
678
679 void CNNNetworkInt8Normalizer::ClampsToReLU(CNNNetwork& net, CNNStatisticHelper& statHelper) {
680     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
681
682     for (auto iter : sortedLayers) {
683         if (isReLULikeClamp(iter) && (iter->precision == Precision::I8 || iter->precision == Precision::U8)) {
684             std::string layerName = iter->name + "_ReLU";
685             LayerParams ssCnnLayerParams {layerName, "ReLU", iter->precision};
686             CNNLayerPtr ssCnnLayer(new ReLULayer(ssCnnLayerParams));
687
688             auto previousLayer = iter->insData[0].lock()->getCreatorLayer().lock();
689             ssCnnLayer->insData.push_back(iter->insData[0]);
690             if (ssCnnLayer->insData[0].lock() == nullptr)
691                 continue;
692             ssCnnLayer->insData[0].lock()->getInputTo().erase(iter->name);
693             ssCnnLayer->insData[0].lock()->getInputTo()[iter->name] = ssCnnLayer;
694
695             ssCnnLayer->outData.push_back(iter->outData[0]);
696             ssCnnLayer->outData[0]->getCreatorLayer() = ssCnnLayer;
697
698             iter->insData.clear();
699             iter->outData.clear();
700         }
701     }
702 }
703
704 void CNNNetworkInt8Normalizer::ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob,
705                                               const std::vector<float>& scales) {
706     if (scales.size() == 0 || /*srcblob->size()*/ srcSize % scales.size() != 0) {
707         THROW_IE_EXCEPTION << "Wrong number of scale factors";
708     }
709
710     size_t channels = scales.size();
711     size_t channelSize = /*srcblob->size()*/ srcSize / channels;
712
713     const float* data = srcData;
714     if (int8blob->getTensorDesc().getPrecision() == Precision::I8) {
715         int8_t* int8data = static_cast<int8_t*>(int8blob->buffer());
716         int minValue = std::numeric_limits<int8_t>::min();
717         int maxValue = std::numeric_limits<int8_t>::max();
718
719         size_t offset;
720
721         float val;
722
723         for (size_t ch = 0; ch < channels; ch++) {
724             offset = channelSize * ch;
725
726             for (size_t i = 0; i < channelSize; i++) {
727                 val = data[offset + i] * scales[ch];
728
729                 if (val > maxValue) {
730                     val = maxValue;
731                 } else if (val < minValue) {
732                     val = minValue;
733                 }
734
735                 int8data[offset + i] = round(val);
736             }
737         }
738     } else if (int8blob->getTensorDesc().getPrecision() == Precision::I32) {
739         int32_t* int32data = static_cast<int32_t*>(int8blob->buffer());
740         int maxValue = std::numeric_limits<int32_t>::max();
741         int minValue = std::numeric_limits<int32_t>::min();
742
743         size_t offset;
744
745         float val;
746
747         for (size_t ch = 0; ch < channels; ch++) {
748             offset = channelSize * ch;
749
750             for (size_t i = 0; i < channelSize; i++) {
751                 val = data[offset + i] * scales[ch];
752
753                 if (val > maxValue) {
754                     val = maxValue;
755                 } else if (val < minValue) {
756                     val = minValue;
757                 }
758
759                 int32data[offset + i] = round(val);
760             }
761         }
762     }
763 }
764
765 CNNLayer::Ptr CNNNetworkInt8Normalizer::createDWConvolutionForScale(const std::string& layerName, size_t channels,
766                                                                     float* ssWValues, float* ssSValues) {
767     // create new Convolution layer
768     LayerParams params;
769     params.name = layerName;
770     params.precision = Precision::FP32;
771     params.type = "Convolution";
772
773     CNNLayerPtr lptr = std::make_shared<ConvolutionLayer>(params);
774     auto* pConv = dynamic_cast<ConvolutionLayer*>(lptr.get());
775     if (pConv == nullptr) {
776         THROW_IE_EXCEPTION << "Layer " << lptr->name << " is not instance of ConvolutionLayer class";
777     }
778
779     pConv->_kernel.insert(X_AXIS, 1);
780     pConv->_kernel.insert(Y_AXIS, 1);
781     pConv->_stride.insert(X_AXIS, 1);
782     pConv->_stride.insert(Y_AXIS, 1);
783     pConv->_padding.insert(X_AXIS, 0);
784     pConv->_padding.insert(Y_AXIS, 0);
785     pConv->_pads_end.insert(X_AXIS, 0);
786     pConv->_pads_end.insert(Y_AXIS, 0);
787     pConv->_dilation.insert(X_AXIS, 1);
788     pConv->_dilation.insert(Y_AXIS, 1);
789
790     pConv->_out_depth = channels;
791     // mkl-dnn does not have i8 depthwise convolution accepting signed i8 input
792     // when it is available, need to uncomment below lines
793
794     // workaround - creation of new weights for simple convolution
795     if (pConv->_out_depth % 16 == 0) {
796         pConv->_group = pConv->_out_depth / 16;
797         Blob::Ptr weights = nullptr;
798         std::shared_ptr<Data> wData =
799             std::shared_ptr<Data>(new Data("weights", {Precision::FP32, {pConv->_out_depth * 16}, Layout::C}));
800         weights = CreateBlobFromData(wData);
801         weights->allocate();
802         float* buffer = weights->buffer().as<float*>();
803         size_t iDist = 0, iSrc = 0;
804         for (size_t g = 0; g < pConv->_group; g++) {
805             for (size_t k = 0; k < 16; k++) {
806                 for (size_t s = 0; s < 16; s++) {
807                     buffer[iDist++] = (s == k) ? ssWValues[iSrc++] : 0.f;
808                 }
809             }
810         }
811         pConv->_weights = weights;
812         pConv->blobs["weights"] = weights;
813     } else {
814         Blob::Ptr weights = nullptr;
815         std::shared_ptr<Data> wData = std::shared_ptr<Data>(
816             new Data("weights", {Precision::FP32, {pConv->_out_depth * pConv->_out_depth}, Layout::C}));
817         weights = CreateBlobFromData(wData);
818         weights->allocate();
819         float* buffer = weights->buffer().as<float*>();
820         for (size_t i = 0, idx = 0; i < pConv->_out_depth; i++) {
821             for (size_t j = 0; j < pConv->_out_depth; j++) {
822                 if (i == j) {
823                     buffer[idx] = ssWValues[i];
824                 } else {
825                     buffer[idx] = 0.f;
826                 }
827                 idx++;
828             }
829         }
830         pConv->_weights = weights;
831         pConv->blobs["weights"] = weights;
832         pConv->_group = 1;
833     }
834     // end of workaround
835
836     // fililng of biases
837     Blob::Ptr biasesBlob = nullptr;
838     std::shared_ptr<Data> bData =
839         std::shared_ptr<Data>(new Data("biases", {Precision::FP32, {pConv->_out_depth}, Layout::C}));
840     biasesBlob = CreateBlobFromData(bData);
841     biasesBlob->allocate();
842     float* bufferBiases = biasesBlob->buffer().as<float*>();
843     for (size_t c = 0; c < pConv->_out_depth; c++) {
844         bufferBiases[c] = ssSValues[c];
845     }
846     pConv->_biases = biasesBlob;
847
848     pConv->blobs["weights"] = pConv->_weights;
849     pConv->blobs["biases"] = pConv->_biases;
850     return lptr;
851 }
852
853 void CNNNetworkInt8Normalizer::replaceScaleShiftByDWConvolution(CNNNetwork& net) {
854     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
855     for (auto layer : sortedLayers) {
856         if (CaselessEq<std::string>()(layer->type, "scaleshift") &&
857             layer->insData[0].lock()->getCreatorLayer().lock() &&
858             !CaselessEq<std::string>()(layer->insData[0].lock()->getCreatorLayer().lock()->type, "input") &&
859             layer->outData[0]->getInputTo().size() > 0) {
860             const auto dims = layer->insData[0].lock()->getTensorDesc().getDims();
861             // only four or five dimensions Convolution layers are supported
862             if ((dims.size() == 4) || (dims.size() == 5)) {
863                 // verification if this layer does not pass data to PriorBox, if it passes, we do not substitute
864                 bool notToPriorBox = true;
865                 for (auto o : layer->outData[0]->getInputTo()) {
866                     if (CaselessEq<std::string>()(o.second->type, "priorbox") ||
867                         CaselessEq<std::string>()(o.second->type, "priorboxclustered")) {
868                         notToPriorBox = false;
869                     }
870                 }
871                 if (notToPriorBox) {
872                     ScaleShiftLayer* pSS = dynamic_cast<ScaleShiftLayer*>(layer.get());
873                     float* ssWValues = pSS->_weights->buffer().as<float*>();
874                     float* ssSValues = pSS->_biases->buffer().as<float*>();
875                     CNNLayer::Ptr newLayer = createDWConvolutionForScale(
876                         layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues);
877
878                     newLayer->outData = layer->outData;
879                     newLayer->outData[0]->getCreatorLayer() = newLayer;
880                     newLayer->insData = layer->insData;
881                     if (newLayer->insData[0].lock() == nullptr)
882                         continue;
883                     newLayer->insData[0].lock()->getInputTo().erase(layer->name);
884                     newLayer->insData[0].lock()->getInputTo()[newLayer->name] = newLayer;
885                 }
886             }
887         }
888     }
889 }
890
// Quantizes the weights and biases of a Convolution or FullyConnected layer:
// weights become I8 blobs, biases become I32 blobs, and the scale blobs the
// plugin needs at execution time ("i-scale", "w-scale", "o-scale"/"ext-scale")
// are attached to the layer. If no output statistics are available, the output
// edge is forced back to FP32.
void CNNNetworkInt8Normalizer::QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr target_layer,
                                                                   CNNStatisticHelper& statHelper) {
    // dimension 1 is the channel axis for both Convolution and FullyConnected
    size_t inputChannels = target_layer->insData[0].lock()->getTensorDesc().getDims()[1];
    size_t outputChannels = target_layer->outData[0]->getTensorDesc().getDims()[1];

    auto iScale = statHelper.getInputScale(target_layer);
    if (iScale == nullptr)
        THROW_IE_EXCEPTION << "Layer '" << target_layer->name << "'has invalid scale";

    target_layer->blobs["i-scale"] = iScale;

    Blob::Ptr weights = nullptr;
    Blob::Ptr biases = nullptr;

    Blob::Ptr int8weights = nullptr;
    Blob::Ptr int32biases = nullptr;

    if (target_layer->blobs.find("weights") != target_layer->blobs.end()) {
        weights = target_layer->blobs["weights"];

        // Creating int8 weights blob
        std::shared_ptr<Data> int8WeightsData =
            std::shared_ptr<Data>(new Data("weights", TensorDesc(Precision::I8, weights->getTensorDesc().getDims(),
                                                                 weights->getTensorDesc().getLayout())));
        int8weights = CreateBlobFromData(int8WeightsData);
        int8weights->allocate();
        target_layer->blobs["weights"] = int8weights;
    }

    if (target_layer->blobs.find("biases") != target_layer->blobs.end()) {
        biases = target_layer->blobs["biases"];

        // Creating int8 biases blob
        std::shared_ptr<Data> int32BiasesData =
            std::shared_ptr<Data>(new Data("biases", TensorDesc(Precision::I32, biases->getTensorDesc().getDims(),
                                                                biases->getTensorDesc().getLayout())));
        int32biases = CreateBlobFromData(int32BiasesData);
        int32biases->allocate();
        target_layer->blobs["biases"] = int32biases;
    }

    // one scaler per output channel; also reused below to quantize the biases
    std::vector<float> weightScalers;

    // Creating w-scale blob
    if (weights) {
        const float* weight = static_cast<const float*>(weights->buffer());

        // null for FullyConnected - then group defaults to 1
        ConvolutionLayer* pConv1 = dynamic_cast<ConvolutionLayer*>(target_layer.get());

        if (pConv1 != nullptr && pConv1->_group == 0) {
            THROW_IE_EXCEPTION << "Convolution '" << target_layer->name << "'has wrong groups number == 0";
        }
        int group = 1;
        if (pConv1 != nullptr && pConv1->_group != 1) {
            group = pConv1->_group;
        }

        std::vector<float> newWeights;  // "new" weights are weights multiplied by i-scale

        // weights are assumed laid out as [group][out-ch][in-ch][spatial]
        size_t W_CO = outputChannels / group, W_CI = inputChannels / group,
               W_HW = weights->size() / W_CI / W_CO / group;

        {
            // fold the per-input-channel i-scale into the weights so a single
            // per-output-channel multiplier (w-scale) suffices at runtime
            float* iScaleMemory = static_cast<float*>(iScale->buffer());
            for (size_t g = 0; g < group; g++) {
                for (size_t co = 0; co < W_CO; co++) {
                    for (size_t ci = 0; ci < W_CI; ci++) {
                        size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW;
                        for (size_t hw = 0; hw < W_HW; hw++) {
                            newWeights.push_back(weight[kernelBase + hw] * iScaleMemory[g * W_CI + ci]);
                        }
                    }
                }
            }
        }
        if (newWeights.empty())
            THROW_IE_EXCEPTION << "Could not quantize layer '" << target_layer->name << "'. Invalid layer parameters.";
        size_t outChannelSize = weights->getTensorDesc().getDims().back() / W_CO / group;

        // Calculating weights normalization scale factor (w-scale)

        std::set<double> individualsG;   // distinct weight values (capped at 256)
        size_t co;
        float* weight_convolution;
        bool bwquantized = false;        // true if weights already look int8-quantized
        double symQuant = 0.f;           // quantum step restored from the weights

        for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
             co++, weight_convolution += outChannelSize) {
            for (size_t i = 0; i < outChannelSize && individualsG.size() < 256; i++) {
                individualsG.insert(static_cast<double>(weight_convolution[i]));
            }
        }
        // If we have 256 quantums for all filters in convolution, it can be already int8 quantized weights
        // We can support symmetric quantization
        // Below conditions verify if weights are symmetric quantized around 0, what are min/max borders
        // These parameters are required to repeat exactly the same quantum as model was trained
        // The algorithm of restoring min/max parameters has couple assumptions which might not work for 100%
        // cases. We want to explicitly define them. We assume that
        // 1. All convolutions have 1st quantum either from positive or negative side. See how we calculate symQuant
        // 2. If quantization is not symmetric, there should be quant on one of the side which demonstrate this
        if (individualsG.size() < 256) {
            // going over weights and verify that weights stay on quant positions
            std::set<double> intervals;
            double prev = 0.f;
            for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                if (prev) {
                    intervals.insert(*it - prev);
                }
                prev = *it;
            }
            // smallest gap between neighboring distinct values = candidate quantum
            if (!intervals.empty()) {
                symQuant = *(intervals.begin());
            }
            // every gap must be an integer multiple of the candidate quantum
            std::set<double> divs;
            if (symQuant != 0.) {
                prev = 0.f;
                for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                    if (prev) {
                        divs.insert((*it - prev) / symQuant);
                    }
                    prev = *it;
                }
            }

            bwquantized = true;
            for (auto it3 = divs.begin(); it3 != divs.end(); it3++) {
                if (fabs(round(*it3) - *it3) > 0.001) {
                    bwquantized = false;
                }
            }

            // we want to make sure that quantization is symmetric. this way we are looking for the
            // value in weights matching to the quant (positive or negative
            if (bwquantized) {
                // take the minimal and maximum values on calculated symQuant and compare with data from individuals
                double minCalc = symQuant * -128.0f;
                double maxCalc = symQuant * 128.0f;
                for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                    if (*it < minCalc || *it > maxCalc) {
                        bwquantized = false;
                    }
                }
            }
        }
        if (bwquantized && symQuant != 0.0f) {
            // pre-quantized weights: one common scaler restores the original quantum
            float max = symQuant * 127.0f;
            for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
                 co++, weight_convolution += outChannelSize) {
                float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
                weightScalers.push_back(scaler);
            }
        } else {
            // generic case: per-output-channel scaler from the channel's abs-max
            for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
                 co++, weight_convolution += outChannelSize) {
                float max = FLT_MIN;
                DataStats::GetDataAbsMax(weight_convolution, outChannelSize, max);

                float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
                weightScalers.push_back(scaler);
            }
        }

        std::shared_ptr<Data> wScaleData =
            std::shared_ptr<Data>(new Data("w-scale", {Precision::FP32, {outputChannels}, Layout::C}));
        auto wScale = CreateBlobFromData(wScaleData);
        wScale->allocate();

        float* wScaleMemory = static_cast<float*>(wScale->buffer());

        // w-scale is the inverse of the quantization scaler
        for (size_t i = 0; i < outputChannels; i++) {
            wScaleMemory[i] = 1.0 / weightScalers[i];
        }
        target_layer->blobs["w-scale"] = wScale;

        auto oScale = statHelper.getOutputScale(statHelper.getLatestInFuse(target_layer));
        if (oScale) {
            // there might not be o-scale if we do not have statistic after convolution that means
            // returning to float precision after convolution
            target_layer->blobs["o-scale"] = oScale;

            // debug scales. Need to compare with actual values in FP32 scoring
            target_layer->blobs["ext-scale"] = target_layer->blobs["o-scale"];
        } else {
            // we do not have statistics here, we cannot calculate requantizatin scales,
            // next layer will be calculated in fp32
            // it's time to return forcedly edge to fp32 as well
            target_layer->outData[0]->setPrecision(Precision::FP32);
        }

        // Normalizing the weights
        ScaleDataToInt(&newWeights[0], weights->size(), int8weights, weightScalers);
    }

    // Normalizing the biases
    if (biases) {
        const float* bias = static_cast<const float*>(biases->buffer());
        ScaleDataToInt(bias, biases->size(), int32biases, weightScalers);
    }
}
1091
1092 bool CNNNetworkInt8Normalizer::layerProducesFloat(const CNNLayer::Ptr layer) {
1093     // currently we support only case of layers which have one output port
1094     if (layer->outData.size() > 1) {
1095         return false;
1096     }
1097
1098     bool consumersFP32 = true;
1099     for (const auto dOut : layer->outData[0]->getInputTo()) {
1100         if (dOut.second->precision != Precision::FP32) {
1101             consumersFP32 = false;
1102         }
1103     }
1104     return consumersFP32;
1105 }
1106
// Walks backwards from 'layer' over pooling/concat/relu/relu-like-clamp chains
// and returns to FP32 every tail whose consumers are all FP32, so that no
// int8 data remains at the very end of the quantized region.
void CNNNetworkInt8Normalizer::returnTailToFP32(const CNNLayer::Ptr layer) {
    // worklist of candidate layers to flip back to FP32
    std::set<CNNLayer::Ptr> layersToReturn;
    if (layerProducesFloat(layer)) {
        layersToReturn.insert(layer);
    }

    while (!layersToReturn.empty()) {
        CNNLayer::Ptr layerA = *layersToReturn.begin();
        layersToReturn.erase(layerA);
        // 1. if it is Pooling layer, or concat layer, we can return it to FP32 as well
        // we need to return it's out data
        if ((CaselessEq<std::string>()(layerA->type, "pooling") || CaselessEq<std::string>()(layerA->type, "concat")) &&
            layerA->outData.size() == 1) {
            layerA->precision = Precision::FP32;
            layerA->outData[0]->setPrecision(Precision::FP32);
        }

        // conv/fc/relu keep int8 execution but get an FP32 output edge
        if ((CaselessEq<std::string>()(layerA->type, "convolution") ||
             CaselessEq<std::string>()(layerA->type, "fullyconnected") ||
             CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA)) &&
            layerA->outData.size() == 1) {
            layerA->outData[0]->setPrecision(Precision::FP32);
            // NOTE(review): when a ReLU's producer is a fusion candidate
            // (canLayerBeI8), the ReLU and its producer's output edge are
            // presumably flipped together so the fused pair stays consistent
            // - confirm against the fusion logic in DefinesExecutionPrecision
            if (CaselessEq<std::string>()(layerA->type, "relu")
                    && layerA->insData[0].lock() != nullptr
                    && canLayerBeI8(layerA->insData[0].lock()->getCreatorLayer().lock())) {
                layerA->precision = Precision::FP32;
                layerA->insData[0].lock()->getCreatorLayer().lock()->outData[0]->setPrecision(Precision::FP32);
            }
        }

        // adding parents for analysis
        if (!CaselessEq<std::string>()(layerA->type, "convolution") &&
            !CaselessEq<std::string>()(layerA->type, "fullyconnected")) {
            // for all parents, if they produce data to only FP32 layers
            for (auto i : layerA->insData) {
                DataPtr d = i.lock();
                if (d != nullptr && d->getCreatorLayer().lock()->precision != Precision::FP32 &&
                    (CaselessEq<std::string>()(layerA->type, "pooling") ||
                     CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA) ||
                     CaselessEq<std::string>()(layerA->type, "concat"))) {
                    if (layerProducesFloat(d->getCreatorLayer().lock())) {
                        layersToReturn.insert(d->getCreatorLayer().lock());
                    }
                }
            }
        }
    }
}
1155
1156 bool CNNNetworkInt8Normalizer::canLayerBeI8(const CNNLayer::Ptr& layer) {
1157     // fusion can happen only if initial layer supplies data to only one layer
1158     // if it sends to several layers - it is safe to execute initial layer in any precision
1159     if (layer->outData[0]->getInputTo().size() == 1) {
1160         std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
1161         if (CaselessEq<std::string>()(aType, "relu")) {
1162             return true;
1163         } else if (CaselessEq<std::string>()(aType, "clamp")) {
1164             if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {
1165                 return false;
1166             }
1167         } else {
1168             static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
1169                 "elu",  "clamp",  "tanh",        "logistic",  "square", "abs",
1170                 "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
1171             return nonSuportedActivations.find(aType) == nonSuportedActivations.end();
1172         }
1173     }
1174     return true;
1175 }
1176
1177 bool CNNNetworkInt8Normalizer::isNextFusionAllowed(const CNNLayer::Ptr& layer) {
1178     // fusion can happen only if initial layer supplies data to only one layer
1179     // if it sends to several layers - it is safe to execute initial layer in any precision
1180     if (layer->outData[0]->getInputTo().size() == 1) {
1181         std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
1182         if (CaselessEq<std::string>()(aType, "relu")) {
1183             ReLULayer* rL = dynamic_cast<ReLULayer*>(layer->outData[0]->getInputTo().begin()->second.get());
1184             if (rL == nullptr) {
1185                 THROW_IE_EXCEPTION << "Layer " << layer->outData[0]->getInputTo().begin()->second->name
1186                                    << " is not instance of ReLULayer class";
1187             }
1188             if (rL->negative_slope != 0.f) {
1189                 return false;
1190             }
1191         } else if (CaselessEq<std::string>()(aType, "clamp")) {
1192             if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {
1193                 return false;
1194             }
1195         } else {
1196             static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
1197                 "elu",  "clamp",  "tanh",        "logistic",  "square", "abs",
1198                 "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
1199             return nonSuportedActivations.find(aType) == nonSuportedActivations.end();
1200         }
1201     } else {
1202         if (CaselessEq<std::string>()(layer->type, "eltwise")) {
1203             return false;
1204         }
1205     }
1206     return true;
1207 }
1208
1209 bool CNNNetworkInt8Normalizer::isReLULikeClamp(CNNLayer::Ptr layer) {
1210     if (CaselessEq<std::string>()(layer->type, "Clamp")) {
1211         ClampLayer* clamp = dynamic_cast<ClampLayer*>(layer.get());
1212         if (clamp == nullptr) {
1213             THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << layer->name << "' to Clamp";
1214         }
1215         return clamp->min_value == 0;
1216     }
1217     return false;
1218 }
1219
1220 void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper) {
1221     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
1222
1223     // Converting layers to Int8. Calculating the multipliers if needed
1224     for (auto iter : sortedLayers) {
1225         if (iter->params.find("quantization_level") != iter->params.end() &&
1226             (iter->params["quantization_level"] == "FP32" || iter->params["quantization_level"] == "FP16")) {
1227             continue;
1228         }
1229
1230         // Legacy: FullyConnected should not be converted to Int8,
1231         // if it isn't explicitly marked to.
1232         if (iter->params.find("quantization_level") == iter->params.end() &&
1233             CaselessEq<std::string>()(iter->type, "fullyconnected")) {
1234             continue;
1235         }
1236
1237         if (!statHelper.canLayerBeQuantized(iter)) {
1238             continue;
1239         }
1240
1241         if (CaselessEq<std::string>()(iter->type, "convolution") ||
1242             CaselessEq<std::string>()(iter->type, "fullyconnected")) {
1243             if (canLayerBeI8(iter)) {
1244                 iter->precision = Precision::I8;
1245                 // we will override I8 to U8 during analysing of Conv-ReLU and Conv-Sum-ReLU fusions
1246                 iter->outData[0]->setPrecision(Precision::I8);
1247             }
1248         } else if (CaselessEq<std::string>()(iter->type, "relu") || isReLULikeClamp(iter)) {
1249             // casting to ReLU
1250             ReLULayer* rL = dynamic_cast<ReLULayer*>(iter.get());
1251             DataPtr outData = iter->outData.size() ? iter->outData[0] : nullptr;
1252             auto inputData = iter->insData[0].lock();
1253             if (inputData && inputData->getCreatorLayer().lock()->precision != Precision::FP32 &&
1254                 outData->getPrecision() == Precision::FP32) {
1255                 iter->precision = Precision::I8;
1256                 if (rL != nullptr && rL->negative_slope != 0.0f) {
1257                     outData->setPrecision(Precision::I8);
1258                 } else {
1259                     outData->setPrecision(Precision::U8);
1260                     // if convolution is a predecessor, change its data to U8 also
1261                     CNNLayer::Ptr prevLayer = inputData->getCreatorLayer().lock();
1262                     if (prevLayer && (CaselessEq<std::string>()(prevLayer->type, "convolution") ||
1263                                       CaselessEq<std::string>()(prevLayer->type, "fullyconnected") ||
1264                                       CaselessEq<std::string>()(prevLayer->type, "eltwise"))) {
1265                         if (!isNextFusionAllowed(prevLayer) && inputData->getPrecision() == Precision::I8) {
1266                             outData->setPrecision(Precision::I8);
1267                         } else {
1268                             inputData->setPrecision(Precision::U8);
1269                         }
1270                     }
1271                     // if there is a patter A0 -> Eltwise -> ReLU and Convolution -> Eltwise -> ReLU,
1272                     // need to mark data after conv as U8
1273                     if (prevLayer && CaselessEq<std::string>()(prevLayer->type, "eltwise")) {
1274                         // decising which input will be used for fusion conv-sum-relu
1275                         CNNLayer::Ptr input1 = prevLayer->insData[0].lock()->getCreatorLayer().lock();
1276                         CNNLayer::Ptr input2 = prevLayer->insData[1].lock()->getCreatorLayer().lock();
1277                         CNNLayer::Ptr convLayer = nullptr;
1278                         CNNLayer::Ptr sumLayer = nullptr;
1279
1280                         if (!CaselessEq<std::string>()(input1->type, "convolution")) {
1281                             sumLayer = input1;
1282                             convLayer = input2;
1283                         } else {
1284                             // it covers a case when both inputs are convolutions or when first input is not convolution
1285                             convLayer = input1;
1286                             sumLayer = input2;
1287                         }
1288                         convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
1289                     }
1290                 }
1291             }
1292         } else if (CaselessEq<std::string>()(iter->type, "pooling")) {
1293             auto pool = dynamic_cast<PoolingLayer*>(iter.get());
1294             if (pool == nullptr) {
1295                 THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << iter->name << "' to pooling";
1296             }
1297
1298             if (pool->_type == PoolingLayer::MAX || (pool->_type == PoolingLayer::AVG && pool->outData.size() == 1)) {
1299                 auto prevLayer = iter->insData[0].lock()->getCreatorLayer().lock();
1300                 if (prevLayer && (prevLayer->precision == Precision::I8 || prevLayer->precision == Precision::U8)) {
1301                     iter->precision = Precision::I8;
1302                     iter->outData[0]->setPrecision(statHelper.hasNegativeOutput(iter->name) ? Precision::I8
1303                                                                                             : Precision::U8);
1304                 }
1305             }
1306         } else if (CaselessEq<std::string>()(iter->type, "concat")) {
1307             // we can do safe
1308             // casting to concat and take axis parameter
            // we can concatenate scales only if concat does concatenation by feature maps
1310             bool axisFeatureMaps = false;
1311             auto concatLayer = dynamic_cast<ConcatLayer*>(iter.get());
1312             if (concatLayer) {
1313                 if (concatLayer->_axis == 1 && concatLayer->insData.size() &&
1314                     concatLayer->insData[0].lock()->getTensorDesc().getDims().size() == 4) {
1315                     axisFeatureMaps = true;
1316                 }
1317             } else {
1318                 THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer " << iter->name << " to concat";
1319             }
1320
1321             if (axisFeatureMaps) {
1322                 // verification of input data types
1323                 bool inputFP32 = false;
1324                 bool inputI8 = false;
1325                 bool inputU8 = false;
1326
1327                 for (auto inputData : iter->insData) {
1328                     auto data = inputData.lock();
1329                     if (data->getPrecision() == Precision::FP32) {
1330                         inputFP32 = true;
1331                     } else if (data->getPrecision() == Precision::I8) {
1332                         inputI8 = true;
1333                     } else if (data->getPrecision() == Precision::U8) {
1334                         inputU8 = true;
1335                     } else {
1336                         // Is it a case of input, i.e. passing I16 to concat?
1337                         // TODO(amalyshe) to handle inputs as a separate usecase
1338                         THROW_IE_EXCEPTION << "I8 normalizer: input data has unknown precision on the edge for concat: "
1339                                            << data->getName();
1340                     }
1341                 }
1342
1343                 if (inputFP32) {
1344                     for (auto i : iter->insData) {
1345                         if (i.lock()->getCreatorLayer().lock()->precision != Precision::FP32) {
1346                             returnTailToFP32(i.lock()->getCreatorLayer().lock());
1347                         }
1348                     }
1349                 } else {
1350                     iter->precision = Precision::I8;
1351
                    // we set output precision to U8 only if all inputs are U8; in any other case it will be I8
1353                     auto outputPrecision = (inputU8 && !inputI8) ? Precision::U8 : Precision::I8;
1354
1355                     // if we have mixed input for I8 and U8, we have to insert scale to edges having U8 to convert to I8
                    // Yes, it leads to losing some precision and might lead to some performance degradation
1357                     // until we have scale supporting s8/u8 input and s8/u8 output.
1358                     if (inputU8 && inputI8) {
1359                         // looking for all edges having U8
1360                         for (size_t d = 0; d < iter->insData.size(); d++) {
1361                             auto data = iter->insData[d].lock();
1362                             if (data->getPrecision() == Precision::U8) {
1363                                 const size_t c = static_cast<size_t>(data->getDims()[1]);
1364                                 std::vector<float> ssWValues(c, 1.0f);
1365                                 std::vector<float> ssSValues(c, 0.0f);
1366
1367                                 std::string layerName =
1368                                     data->getCreatorLayer().lock()->name + "_Concat_ScaleShift_U8I8_" + iter->name;
1369                                 CNNLayer::Ptr newLayer =
1370                                     createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
1371                                 newLayer->precision = Precision::I8;
1372                                 AddLayerToCNNNetworkBeforeLayer(newLayer, iter, d);
1373
1374                                 // update statistic to pass quantization smoothly
1375                                 std::string inputLayerName =
1376                                     newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
1377                                 statHelper.copyStatistics(inputLayerName, layerName);
1378                                 newLayer->outData[0]->setPrecision(Precision::I8);
1379                             }
1380                         }
1381                     }
1382
1383                     if (iter->outData.size() == 1) {
1384                         for (auto&& out : iter->outData) {
1385                             out->setPrecision(outputPrecision);
1386                         }
1387                     }
1388                 }
1389             }
1390         } else if (CaselessEq<std::string>()(iter->type, "eltwise")) {
1391             // we decide which of the layers will be in int-8 mode and initialize special scale which will be used
1392             // later in "conv-sum-relu" fuse. i8 execution of eltwise always assume this fusion
1393             if (canLayerBeI8(iter)) {
1394                 if (iter->insData.size() == 2) {
1395                     CNNLayer::Ptr input1 = iter->insData[0].lock()->getCreatorLayer().lock();
1396                     CNNLayer::Ptr input2 = iter->insData[1].lock()->getCreatorLayer().lock();
1397                     if ((CaselessEq<std::string>()(input1->type, "convolution") ||
1398                          CaselessEq<std::string>()(input2->type, "convolution")) &&
1399                         !CaselessEq<std::string>()(input1->type, "concat") &&
1400                         !CaselessEq<std::string>()(input2->type, "concat") && input1->precision != Precision::FP32 &&
1401                         input2->precision != Precision::FP32) {
1402                         // understand which layer will be used for sum
1403                         CNNLayer::Ptr sumLayer = nullptr;
1404                         CNNLayer::Ptr convLayer = nullptr;
1405
1406                         if (!CaselessEq<std::string>()(input1->type, "convolution")) {
1407                             sumLayer = input1;
1408                             convLayer = input2;
1409                         } else {
1410                             // it covers a case when both inputs are convolutions or when first input is not convolution
1411                             sumLayer = input2;
1412                             convLayer = input1;
1413                         }
1414
1415                         // if we find supported activation, mark it's output as I8 or U8 depending on statistics
1416                         if (iter->outData.size() == 1 && iter->outData[0]->getInputTo().size() == 1 &&
1417                             (CaselessEq<std::string>()(iter->outData[0]->getInputTo().begin()->second->type, "ReLU") ||
1418                              CNNNetworkInt8Normalizer::isReLULikeClamp(
1419                                  iter->outData[0]->getInputTo().begin()->second))) {
1420                             auto activation = iter->outData[0]->getInputTo().begin()->second;
1421                             activation->precision = Precision::I8;
1422                             if (!statHelper.hasNegativeOutput(statHelper.getLatestInFuse(convLayer)->name)) {
1423                                 activation->outData[0]->setPrecision(Precision::U8);
1424                                 iter->outData[0]->setPrecision(Precision::U8);
1425                             } else {
1426                                 activation->outData[0]->setPrecision(Precision::I8);
1427                                 iter->outData[0]->setPrecision(Precision::I8);
1428                             }
1429                         } else {
1430                             iter->outData[0]->setPrecision(Precision::I8);
1431                         }
1432
1433                         if (convLayer->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
1434                             // verify precision on input edges before and after eltwise fusion
                            // if we have an i8/u8 mismatch between the sum layer input and conv-sum-activation output,
1436                             // then in this case we have to add requantization to i8 on sum input edge
1437                             auto latestInFuse = statHelper.getLatestInFuse(convLayer);
1438                             if (latestInFuse->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
1439                                 if (input1 == sumLayer &&
1440                                     iter->insData[0].lock()->getTensorDesc().getPrecision() == Precision::U8) {
1441                                     sumLayer = addU8ToI8Conversion(iter->insData[0].lock(), iter, statHelper);
1442                                 } else if (input2 == sumLayer &&
1443                                            iter->insData[1].lock()->getTensorDesc().getPrecision() == Precision::U8) {
1444                                     sumLayer = addU8ToI8Conversion(iter->insData[0].lock(), iter, statHelper);
1445                                 }
1446                                 if (!sumLayer) {
1447                                     THROW_IE_EXCEPTION << "I8 normalizer had to add U8->I8 conversion before "
1448                                                        << iter->name << " but failed to do this";
1449                                 }
1450                             }
1451
1452                             // mark eltwise as a I8 executable, mark out data as I8
1453                             iter->precision = Precision::I8;
1454                             convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
1455                             // calculate the only scale
1456                             Blob::Ptr sumLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(sumLayer));
1457                             Blob::Ptr convLayerScales =
1458                                 statHelper.getOutputScale(statHelper.getLatestInFuse(convLayer));
1459                             float* sumScale = sumLayerScales->buffer().as<float*>();
1460                             float* convScale = convLayerScales->buffer().as<float*>();
1461                             for (size_t i = 0; i < sumLayerScales->size(); i++) {
1462                                 sumScale[i] /= convScale[i];
1463                             }
1464
1465                             iter->blobs["eltwise-sum-scale"] = sumLayerScales;
1466                         }
1467                     }
1468                 }
1469             } else {
                // if there are convolutions as inputs to this eltwise, we forcibly move them to FP32
1471                 for (auto i : iter->insData) {
1472                     auto type = i.lock()->getCreatorLayer().lock()->type;
1473                     if (CaselessEq<std::string>()(type, "convolution") ||
1474                         CaselessEq<std::string>()(type, "fullyconnected")) {
1475                         i.lock()->getCreatorLayer().lock()->precision = Precision::FP32;
1476                         i.lock()->setPrecision(Precision::FP32);
1477                     }
1478                 }
1479             }
1480         } else if (CaselessEq<std::string>()(iter->type, "resample")) {
1481             iter->precision = Precision::I8;
1482             iter->outData[0]->setPrecision(iter->insData[0].lock()->getPrecision());
1483         }
1484     }
1485
1486     // quantization of weights/biases
1487     sortedLayers = CNNNetSortTopologically(net);
1488     for (auto iter : sortedLayers) {
1489         if (iter->precision == Precision::I8 && (CaselessEq<std::string>()(iter->type, "convolution") ||
1490                                                  CaselessEq<std::string>()(iter->type, "fullyconnected"))) {
1491             QuantizeConvolutionOrFullyConnected(iter, statHelper);
1492         }
1493     }
1494
1495     // Returning of tails to FP32 mode if optimistic approach marked them as I8
1496     // no sense to do pooling in i8, we can return just after convolution
1497     for (auto iter : sortedLayers) {
1498         // TODO(amalyshe) here is a handling of case when iter provides data to the only one next layer
1499         // need to extend to cases when it provides data to many layers
1500         if (iter->precision == Precision::I8 && iter->outData.size() == 1) {
1501             if ((iter->outData[0]->getInputTo().size() == 1 &&
1502                  iter->outData[0]->getInputTo().begin()->second->precision == Precision::FP32) ||
1503                 iter->outData[0]->getInputTo().size() == 0) {
1504                 returnTailToFP32(iter);
1505             }
1506         }
1507     }
1508 }
1509
// Propagates de-quantization scale factors ("o-scale" blobs) down the graph.
// Precondition: DefinesExecutionPrecision has already assigned I8/U8 precisions
// and per-layer scale blobs. Three passes over the topologically sorted layers:
//   1) For an I8 Concat whose inputs all carry "i-concat-scale", build the
//      concat's own per-channel o-scale by concatenating the input scales.
//   2) For every layer holding an "o-scale", hand it to each consumer
//      (pooling/relu/eltwise/resample/concat keep int8; others count as FP32
//      consumers) and decide whether the scale stays on this layer.
//   3) Corner case: if an o-scale was propagated through a linear tail of
//      pooling/relu layers, drop it and let the producing convolution emit
//      de-normalized FP32 values instead.
void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    // Moving o-scales down
    for (auto iter : sortedLayers) {
        if (iter->type == "Concat" && iter->precision == Precision::I8) {
            // Checking if all inputs are INT8
            bool all_inputs_are_int8 = true;
            for (int k = 0; k < iter->insData.size(); k++) {
                auto prevKLayer = iter->insData[k].lock()->getCreatorLayer().lock();
                // An input qualifies only if it is quantized AND already carries
                // a per-channel "i-concat-scale" blob to merge from.
                if ((prevKLayer->precision != Precision::I8 && prevKLayer->precision != Precision::U8) ||
                    prevKLayer->blobs.find("i-concat-scale") == prevKLayer->blobs.end()) {
                    all_inputs_are_int8 = false;
                    break;
                }
            }

            if (all_inputs_are_int8) {
                // Merging o-scales of the inputs to make one for the Concat
                // Creating the o-scale for the Concat by concatenating the input concats
                size_t outputChannels = iter->outData[0]->getTensorDesc().getDims()[1];

                std::shared_ptr<Data> oScaleData =
                    std::shared_ptr<Data>(new Data("o-scale", {Precision::FP32, {outputChannels}, Layout::C}));
                auto oScale = CreateBlobFromData(oScaleData);
                oScale->allocate();

                // Copy each input's scale values back-to-back; concat is over the
                // channel axis, so the merged vector must cover all output channels.
                float* oScaleMemory = static_cast<float*>(oScale->buffer());
                int cc = 0;
                for (int in = 0; in < iter->insData.size(); in++) {
                    auto prevOScale = iter->insData[in].lock()->getCreatorLayer().lock()->blobs["i-concat-scale"];
                    float* prevOScaleMemory = static_cast<float*>(prevOScale->buffer());

                    for (int c = 0; c < prevOScale->size(); c++) {
                        oScaleMemory[cc] = prevOScaleMemory[c];
                        cc++;
                    }
                }
                // Sanity check: merged scale length must match the channel count.
                if (cc != outputChannels)
                    THROW_IE_EXCEPTION << "Size of o-scale after " << iter->name
                                       << " isn't equal to the channels count";

                iter->precision = Precision::I8;
                iter->blobs["o-scale"] = oScale;
            }
        }

        // Pass 2: distribute this layer's o-scale to its consumers and count how
        // many of them consume int8 vs FP32 data.
        if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            int int8Consumers = 0;
            int fp32Consumers = 0;
            // The algorithm below assumes a single output port.
            if (iter->outData.size() > 1) {
                THROW_IE_EXCEPTION << "normalization algorithm for int8 found layer having o-scale and multiple ports";
            }
            if (iter->outData.size() == 1) {
                for (auto l : iter->outData[0]->getInputTo()) {
                    if (l.second->precision == Precision::I8 || l.second->precision == Precision::U8) {
                        // Linear layers (pooling/relu/relu-like clamp) just forward the scale.
                        if (CaselessEq<std::string>()(l.second->type, "Pooling") ||
                            CaselessEq<std::string>()(l.second->type, "ReLU") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(l.second)) {
                            l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            // debug scales. Need to compare with actual values in FP32 scoring
                            l.second->blobs["ext-scale"] = l.second->blobs["o-scale"];
                            int8Consumers++;
                        } else if (l.second->type == "Convolution") {
                            // The convolution absorbs the producer's scale; its own
                            // i-scale becomes redundant.
                            l.second->blobs.erase("i-scale");
                            int8Consumers++;
                        } else if (CaselessEq<std::string>()(l.second->type, "Eltwise")) {
                            if (statHelper.getLatestInFuse(iter) != iter) {
                                l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8 || l.second->precision == Precision::U8) &&
                                   CaselessEq<std::string>()(l.second->type, "Resample")) {
                            // If resample has concat as an input layer it should inherit its
                            // output scale
                            if (l.second->insData.size() == 1) {
                                CNNLayerPtr creator = l.second->insData[0].lock()->getCreatorLayer().lock();
                                if (CaselessEq<std::string>()(creator->type, "Concat")) {
                                    l.second->blobs["o-scale"] = creator->blobs["o-scale"];
                                    l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                                }
                            }

                            // No concat found, let's use statistics
                            if (l.second->blobs.find("o-scale") == l.second->blobs.end()) {
                                auto oScale = statHelper.getOutputScale(l.second);
                                l.second->blobs["o-scale"] = oScale;
                                l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8) &&
                                   CaselessEq<std::string>()(l.second->type, "concat")) {
                            // if concat is i8, we can propagate oscale further to concat.
                            // The logic around o-scale assumes that if we have it in the layer after iteration
                            // in this loop it means that it must not be removed and we need to place
                            // scale. While for concat we return to one layer back and again need to analyze o-scale
                            // and it is not clear if we need to return o-scale or it was only for concat.
                            // Having all of this in mind, it's better to rename o-scale to i-concat-scale
                            iter->blobs["i-concat-scale"] = iter->blobs["o-scale"];
                            int8Consumers++;
                        } else {
                            fp32Consumers++;
                        }
                    } else if (CaselessEq<std::string>()(l.second->type, "priorbox") ||
                               CaselessEq<std::string>()(l.second->type, "priorboxclustered")) {
                        // PriorBox consumers are intentionally ignored: they read shapes,
                        // not data, so they count as neither int8 nor FP32 consumers.
                    } else {
                        // we are leaving o-scale still for adding of scale-shift before FP32 layer
                        fp32Consumers++;
                    }
                }

                // A network output (no consumers) needs de-normalized FP32 data too.
                if (iter->outData[0]->getInputTo().empty()) {
                    fp32Consumers++;
                }

                if (CaselessEq<std::string>()(iter->type, "Convolution") ||
                    CaselessEq<std::string>()(iter->type, "FullyConnected")) {
                    if (int8Consumers) {
                        // Keep the scale on the producer as "oi-scale" so the conv/FC
                        // can emit quantized output directly.
                        iter->blobs["oi-scale"] = iter->blobs["o-scale"];
                    } else {
                        iter->outData[0]->setPrecision(Precision::FP32);
                    }
                }
                // o-scale is only retained when some FP32 consumer still needs a
                // scale-shift inserted in front of it later.
                if (!fp32Consumers) {
                    iter->blobs.erase("o-scale");
                }
            }
        }
    }

    // fixing cornercases when o-scale was propagated through linear tail but it is more efficient to leave
    // conversion to de-normalized values in convolution
    for (auto iter : sortedLayers) {
        if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            // go over out data. if all outputs are fp32, continue this optimization
            bool canOptimize = true;

            // current layer must not be convolution
            if (CaselessEq<std::string>()(iter->type, "convolution")) {
                canOptimize = false;
            }
            for (auto o : iter->outData) {
                for (auto ol : o->getInputTo()) {
                    if (ol.second->precision == Precision::I8) {
                        canOptimize = false;
                    }
                }
            }
            if (!canOptimize) {
                continue;
            }
            // trying to go up until convolution
            // Walk up a strictly linear chain (single input, single output, single
            // consumer) of Pooling/ReLU/relu-like-Clamp layers until the layer
            // holding the matching "oi-scale" (the convolution) is found.
            auto curLayer = iter;
            bool eliminateOScale = true;
            while (curLayer && curLayer->blobs.find("oi-scale") == curLayer->blobs.end() && eliminateOScale) {
                if (curLayer->insData.size() == 1 && curLayer->insData[0].lock()->getCreatorLayer().lock() &&
                    curLayer->insData[0].lock()->getCreatorLayer().lock()->outData.size() == 1 &&
                    curLayer->insData[0].lock()->getInputTo().size() == 1) {
                    curLayer = curLayer->insData[0].lock()->getCreatorLayer().lock();
                    if (!CaselessEq<std::string>()(curLayer->type, "Pooling") &&
                        !CaselessEq<std::string>()(curLayer->type, "ReLU") && !isReLULikeClamp(curLayer) &&
                        !CaselessEq<std::string>()(curLayer->type, "Convolution")) {
                        eliminateOScale = false;
                    }
                } else {
                    eliminateOScale = false;
                }
            }
            if (eliminateOScale && curLayer) {
                // Drop both ends of the scale pair and return the whole tail to FP32.
                for (auto o : iter->outData) {
                    o->setPrecision(Precision::FP32);
                }
                for (auto o : curLayer->outData) {
                    o->setPrecision(Precision::FP32);
                }

                curLayer->blobs.erase("oi-scale");
                iter->blobs.erase("o-scale");
                auto iLayer = iter;
                while (iLayer != curLayer) {
                    if (iLayer->type == "Pooling") {
                        iLayer->precision = Precision::FP32;
                    }
                    iLayer = iLayer->insData[0].lock()->getCreatorLayer().lock();
                }
            }
        }
    }
}
1699
1700 std::string getBlobDimention(const Blob::Ptr blob) {
1701     size_t idx = blob->getTensorDesc().getDims().size();
1702
1703     std::stringstream blobDimention;
1704     blobDimention << "[";
1705     for (auto& dim : blob->getTensorDesc().getDims()) {
1706         blobDimention << dim << ((--idx) != 0u ? ", " : "");
1707     }
1708     blobDimention << "]";
1709
1710     return blobDimention.str();
1711 }
1712
1713 void precisionColoring(const CNNLayerPtr layer, ordered_properties& printed_properties,
1714                        ordered_properties& node_properties) {
1715     // looking for the w-scale
1716     if (layer->blobs.find("w-scale") != layer->blobs.end()) {
1717         printed_properties.insert(
1718             printed_properties.begin(),
1719             std::pair<std::string, std::string>("w-scale", getBlobDimention(layer->blobs.find("w-scale")->second)));
1720     }
1721
1722     // looking for the oi-scale
1723     if (layer->blobs.find("oi-scale") != layer->blobs.end()) {
1724         printed_properties.insert(
1725             printed_properties.begin(),
1726             std::pair<std::string, std::string>("oi-scale", getBlobDimention(layer->blobs.find("oi-scale")->second)));
1727     }
1728
1729     // looking for the o-scale
1730     if (layer->blobs.find("o-scale") != layer->blobs.end()) {
1731         printed_properties.insert(
1732             printed_properties.begin(),
1733             std::pair<std::string, std::string>("o-scale", getBlobDimention(layer->blobs.find("o-scale")->second)));
1734     }
1735     // looking for the i-scale
1736     if (layer->blobs.find("i-scale") != layer->blobs.end()) {
1737         printed_properties.insert(
1738             printed_properties.begin(),
1739             std::pair<std::string, std::string>("i-scale", getBlobDimention(layer->blobs.find("i-scale")->second)));
1740     }
1741
1742     printed_properties.insert(
1743         printed_properties.begin(),
1744         std::pair<std::string, std::string>("Precision", layer->precision == Precision::FP32 ? "FP32" : "I8"));
1745
1746     if (layer->precision == Precision::FP32) {
1747         node_properties.emplace_back("fillcolor", "#5A5DF0");
1748     } else {
1749         node_properties.emplace_back("fillcolor", "#20F608");
1750     }
1751 }
1752
1753 void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats) {
1754     CNNNetwork cnnn(ICNNNetwork::Ptr(&network, [](void*) {}));
1755
1756     int maxSign = 0x7F;
1757     int maxUnsign = 0xFF;
1758
1759     // Applying int8-conversion
1760     StatsMap statsMap = netStats.getNodesStats();
1761
1762     CNNStatisticHelper statHelper(cnnn, statsMap, maxSign, maxUnsign);
1763
1764     replaceScaleShiftByDWConvolution(cnnn);
1765
1766     DefinesExecutionPrecision(cnnn, statHelper);
1767     PropagateScaleFactors(cnnn, statHelper);
1768     ClampsToReLU(cnnn, statHelper);
1769     AddScaleShifts(cnnn, statHelper);
1770 #ifndef NDEBUG
1771     std::ofstream file("i8_normalized.dot");
1772     saveGraphToDot(cnnn, file, precisionColoring);
1773 #endif
1774 }